From ff60688dc59b329cef0564b34de8a7e06834cf23 Mon Sep 17 00:00:00 2001 From: PaddlePaddle-Gardener Date: Fri, 14 Jan 2022 14:21:03 +0800 Subject: [PATCH] mirgate_38853 --- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/framework/tensor.cc | 8 - paddle/fluid/framework/tensor.h | 8 - paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 42 +- .../inference/api/details/zero_copy_tensor.cc | 17 +- paddle/fluid/inference/lite/tensor_utils.cc | 5 +- .../memory/allocation/aligned_allocator.cc | 15 +- .../memory/allocation/aligned_allocator.h | 4 +- paddle/fluid/memory/allocation/allocator.cc | 9 +- paddle/fluid/memory/allocation/allocator.h | 110 ++- .../memory/allocation/allocator_facade.cc | 795 +++++++++++++++--- .../memory/allocation/allocator_facade.h | 30 +- .../auto_growth_best_fit_allocator.cc | 15 +- .../auto_growth_best_fit_allocator.h | 11 +- .../auto_growth_best_fit_allocator_test.cc | 8 +- .../fluid/memory/allocation/base_ptr_test.cu | 118 +++ .../memory/allocation/best_fit_allocator.cc | 6 +- .../memory/allocation/best_fit_allocator.h | 8 +- .../memory/allocation/buffered_allocator.cc | 7 +- .../memory/allocation/buffered_allocator.h | 4 +- .../allocation/buffered_allocator_test.cc | 6 +- .../fluid/memory/allocation/cpu_allocator.cc | 4 +- .../fluid/memory/allocation/cpu_allocator.h | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 14 +- .../fluid/memory/allocation/cuda_allocator.h | 4 +- .../cuda_device_context_allocator.h | 34 +- .../allocation/cuda_virtual_mem_allocator.cc | 228 +++++ .../allocation/cuda_virtual_mem_allocator.h | 62 ++ .../memory/allocation/locked_allocator.cc | 4 +- .../memory/allocation/locked_allocator.h | 4 +- .../allocation/naive_best_fit_allocator.cc | 228 +++-- .../allocation/naive_best_fit_allocator.h | 4 +- .../fluid/memory/allocation/npu_allocator.cc | 6 +- .../fluid/memory/allocation/npu_allocator.h | 4 +- .../memory/allocation/npu_pinned_allocator.cc | 31 +- .../memory/allocation/npu_pinned_allocator.h | 10 +- .../memory/allocation/pinned_allocator.cc | 12 +- .../memory/allocation/pinned_allocator.h | 4 +- .../memory/allocation/retry_allocator.cc | 4 +- .../fluid/memory/allocation/retry_allocator.h | 4 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 201 +++++ .../allocation/stream_safe_cuda_allocator.h | 81 ++ .../allocation/test_aligned_allocator.cc | 4 +- .../allocation/thread_local_allocator.h | 6 +- ...l_memory_auto_growth_best_fit_allocator.cc | 256 ++++++ ...al_memory_auto_growth_best_fit_allocator.h | 84 ++ paddle/fluid/memory/malloc.h | 22 +- .../fluid/operators/math/concat_and_split.cu | 38 +- .../device/mlu/device_context_allocator.h | 162 ++++ .../fluid/platform/device/npu/npu_op_runner.h | 3 +- paddle/fluid/pybind/eager_functions.cc | 174 ++++ paddle/pten/api/lib/utils/CMakeLists.txt | 2 +- paddle/pten/api/lib/utils/allocator.h | 49 ++ paddle/pten/api/lib/utils/storage.cc | 40 + paddle/pten/api/lib/utils/tensor_utils.cc | 521 ++++++++++++ paddle/pten/core/allocator.h | 153 ++++ paddle/pten/core/candidate/allocator.h | 107 +++ paddle/pten/core/dense_tensor.h | 2 + paddle/pten/core/storage.h | 1 + paddle/pten/tests/core/allocator.h | 96 +++ paddle/pten/tests/core/test_allocator.cc | 95 +++ tools/check_file_diff_approvals.sh | 19 +- 64 files changed, 3574 insertions(+), 444 deletions(-) create mode 100644 paddle/pten/api/lib/utils/allocator.h create mode 100644 paddle/pten/api/lib/utils/storage.cc create mode 100644 
paddle/pten/api/lib/utils/tensor_utils.cc create mode 100644 paddle/pten/core/candidate/allocator.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a46c83a2b..09e4abc77f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -410,8 +410,8 @@ class ExecutionContext { auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); auto& deleter = tmp_allocation_ptr.get_deleter(); auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = std::shared_ptr( - allocation_ptr, deleter); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); PADDLE_ENFORCE_GE( allocation_ptr->size(), framework::product(dim) * sizeof(T), diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index f11b37825d..6aa10a0580 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -17,14 +17,6 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index e86009e9aa..fcdb837bc8 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -32,14 +32,6 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7fd125834a..5fd5812200 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -151,8 +151,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 73829898be..11858e4166 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -30,6 +30,11 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/device_context.h" +#endif + +#include "paddle/pten/core/dense_tensor.h" namespace paddle { namespace framework { @@ -72,6 +77,8 @@ class Tensor; void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, pten::DenseTensor* dst); // NOTE(zcd): If the src.place() and dst_place are two different GPU, // the copy operation is carried out on the dst_place's stream. This is @@ -82,6 +89,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, // not completed. 
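The note above means a cross-device TensorCopy is asynchronous with respect to the caller. A minimal caller-side sketch (not part of this patch; the places, size and pool lookup are illustrative) of how the destination context can be synchronized before the copied data is consumed:

// Hedged usage sketch: a cross-GPU TensorCopy runs on the dst place's stream,
// so wait on that context before reading the result.
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

void CopyAcrossDevices(const paddle::framework::Tensor& src,
                       paddle::framework::Tensor* dst) {
  paddle::platform::CUDAPlace dst_place(1);
  auto* dst_ctx =
      paddle::platform::DeviceContextPool::Instance().Get(dst_place);
  // The copy is only enqueued here; it may still be running on return.
  paddle::framework::TensorCopy(src, dst_place, *dst_ctx, dst);
  dst_ctx->Wait();  // make sure the copy has finished before using *dst
}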
void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + pten::DenseTensor* dst); void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); @@ -174,8 +183,7 @@ void TensorFromArray(const T* src, const size_t& array_size, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -232,13 +240,20 @@ void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(dst_place)) { + memory::Copy( + BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif } // The fully specialized function should be inline to avoid @@ -295,8 +310,7 @@ inline void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -371,6 +385,14 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, size, nullptr); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif } template <> @@ -412,6 +434,14 @@ inline void TensorToVector(const Tensor& src, BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, size, nullptr); } +#endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } #endif for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index bb537f0c65..2f2f4c0ead 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -223,9 +223,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto t_place = tensor->place(); paddle::framework::Tensor out; - auto mem_allocation = std::make_shared( - static_cast(data), ele_num * sizeof(T), - paddle::platform::CPUPlace()); + auto mem_allocation = + std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); out.ResetHolder(mem_allocation); if (paddle::platform::is_cpu_place(t_place)) { @@ -239,6 +240,14 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #else 
std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#endif + } else if (paddle::platform::is_ipu_place(t_place)) { +#ifdef PADDLE_WITH_IPU + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#else + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Can not create tensor with IPU place because paddle is not compiled " + "with IPU.")); #endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -290,7 +299,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), npu_place, t_data, ele_num * sizeof(T), dev_ctx->stream()); - aclrtSynchronizeStream(dev_ctx->stream()); + paddle::platform::NPUStreamSync(dev_ctx->stream()); #else PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with NPU place because paddle is not compiled " diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index b1e0eb5ef1..0d5cd29a0c 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -257,9 +257,8 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { size_t memory_size = GetLiteTensorNumel(*src) * framework::SizeOfType(GetNativePrecisionType(src->precision())); - std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, memory_size, - GetNativePlace(src->target()))); + std::shared_ptr holder(new pten::Allocation( + src_raw_data, memory_size, GetNativePlace(src->target()))); dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, GetNativePrecisionType(src->precision())); diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index f0b7f1a4b0..258cff32b4 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -23,15 +23,16 @@ namespace allocation { // For memory address alignment class AlignedAllocation : public Allocation { public: - AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) + AlignedAllocation(DecoratedAllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast(underlying_allocation->ptr()) + offset, + underlying_allocation->base_ptr(), underlying_allocation->size() - offset, underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)) {} private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; AlignedAllocator::AlignedAllocator( @@ -51,13 +52,17 @@ bool AlignedAllocator::IsAllocThreadSafe() const { return underlying_allocator_->IsAllocThreadSafe(); } -Allocation* AlignedAllocator::AllocateImpl(size_t size) { +pten::Allocation* AlignedAllocator::AllocateImpl(size_t size) { auto raw_allocation = underlying_allocator_->Allocate(size + alignment_); size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_); - return new AlignedAllocation(std::move(raw_allocation), offset); + auto* p = new AlignedAllocation( + static_unique_ptr_cast(std::move(raw_allocation)), offset); + return p; } -void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; } +void AlignedAllocator::FreeImpl(pten::Allocation* allocation) { + delete allocation; +} } // namespace allocation } // namespace memory diff --git 
a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 6fef5cae8d..ffd5ad0fae 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -30,9 +30,9 @@ class AlignedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation* AllocateImpl(size_t size) override; + pten::Allocation* AllocateImpl(size_t size) override; - void FreeImpl(Allocation* allocation) override; + void FreeImpl(pten::Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 4998f3dbb9..0ef6f5cbab 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -18,11 +18,10 @@ namespace paddle { namespace memory { namespace allocation { -bool Allocator::IsAllocThreadSafe() const { return false; } - -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); +void Allocator::FreeImpl(pten::Allocation* allocation) { + static_cast(allocation) + ->TopDecoratedAllocator() + ->Free(allocation); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b11c657b96..3f04d47516 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -22,6 +22,9 @@ #include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/allocator.h" + +DECLARE_string(allocator_strategy); namespace paddle { namespace memory { @@ -78,40 +81,26 @@ class Allocator; * e.g., something what is done in AlignedAllocator, etc. * In this case, we should declare a derived class of Allocation, which * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would + * Therefore, `decorated_allocators_` of the new Allocation object + * would * be a new chain, differing from the underlying Allocation object. */ -class Allocation { +class Allocation : public pten::Allocation { public: - inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) {} - - Allocation(const Allocation& o) = delete; - Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; - - // Returns the holding pointer. - // NOTE: For performance consideration, it is better not to make this method - // as a virtual method. If we want to implement a `defragmentation` later, - // we might need to make `ptr_` field as a protected field, and add a virtual - // method like `defragmentation` to change `ptr_`. - inline void* ptr() const { return ptr_; } - - // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the - // last valid element. - // - // NOTE: Some allocator might alloc more memory than request. The size - // could larger than its request. For example, - // the AlignedAllocator will always allocate memory as size + kAlignment. - // The raw pointer might not aligned, so an offset might be added to raw - // the pointer. The size of this allocation will be - // `size + kAlignemnt - offset`. 
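The `size + alignment - offset` bookkeeping described in the comment above is what the AlignedAllocator changes earlier in this patch implement. A standalone sketch of that arithmetic (illustrative only; it mirrors AlignedPtrOffset further down in allocator.h):

// Sketch of the alignment arithmetic AlignedAllocator relies on; the values
// mentioned in the comments are only an example.
#include <cstddef>
#include <cstdint>

size_t AlignedOffsetExample(const void* raw_ptr, size_t alignment) {
  auto addr = reinterpret_cast<std::uintptr_t>(raw_ptr);
  size_t diff = addr % alignment;
  size_t offset = (diff == 0) ? 0 : alignment - diff;
  // For a request of `size` bytes the underlying allocator is asked for
  // `size + alignment` bytes, so the visible buffer of
  // `size + alignment - offset` bytes is always at least `size` bytes long
  // and starts at a multiple of `alignment`.
  return offset;
}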
- inline size_t size() const { return size_; } - - inline const platform::Place& place() const { return place_; } - - virtual ~Allocation() {} + Allocation(void* ptr, size_t size, platform::Place place) + : pten::Allocation(ptr, size, place), base_ptr_(ptr) {} + Allocation(void* ptr, void* base_ptr, size_t size, + const platform::Place& place) + : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} + + void* base_ptr() const { + PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", + paddle::platform::errors::Unimplemented( + "base_ptr() is only implemented for auto_growth " + "strategy, not support %s strategy", + FLAGS_allocator_strategy)); + return base_ptr_; + } private: inline void RegisterDecoratedAllocator(Allocator* allocator) { @@ -125,9 +114,7 @@ class Allocation { } private: - void* ptr_; - size_t size_; - platform::Place place_; + void* base_ptr_; // the point that directly requested from system /** * NOTE(zjl): Since decorated_allocators_ is usually a small vector. @@ -147,53 +134,42 @@ class Allocation { friend class Allocator; }; +using AllocationPtr = pten::Allocator::AllocationPtr; +using DecoratedAllocationPtr = + std::unique_ptr; + // Base interface class of memory Allocator. -class Allocator { +class Allocator : public pten::Allocator { public: - virtual ~Allocator() {} - - class AllocationDeleter { - public: - inline void operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); - } - }; - - using AllocationPtr = std::unique_ptr; + static void AllocationDeleter(pten::Allocation* allocation) { + Allocator* allocator = + static_cast(allocation)->TopDecoratedAllocator(); + allocator->Free(allocation); + } // Allocate an allocation. // size may be 0, but it would be too complex if we handle size == 0 // in each Allocator. So we handle size == 0 inside AllocatorFacade // in our design. - inline AllocationPtr Allocate(size_t size) { + AllocationPtr Allocate(size_t size) override { auto ptr = AllocateImpl(size); - ptr->RegisterDecoratedAllocator(this); - return AllocationPtr(ptr); + static_cast(ptr)->RegisterDecoratedAllocator(this); + return AllocationPtr(ptr, AllocationDeleter); } - // This function should not be called outside Allocator class - inline void Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); + void Free(pten::Allocation* allocation) { + static_cast(allocation)->PopDecoratedAllocator(); FreeImpl(allocation); } - inline uint64_t Release(const platform::Place& place) { - return ReleaseImpl(place); - } - - // True if the `Allocate` is thread safe. - virtual bool IsAllocThreadSafe() const; + uint64_t Release(const platform::Place& place) { return ReleaseImpl(place); } protected: - virtual Allocation* AllocateImpl(size_t size) = 0; - virtual void FreeImpl(Allocation* allocation); + virtual pten::Allocation* AllocateImpl(size_t size) = 0; + virtual void FreeImpl(pten::Allocation* allocation); virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; } }; -using AllocationDeleter = Allocator::AllocationDeleter; -using AllocationPtr = Allocator::AllocationPtr; - inline size_t AlignedSize(size_t size, size_t alignment) { auto remaining = size % alignment; return remaining == 0 ? size : size + alignment - remaining; @@ -205,6 +181,14 @@ inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) { return diff == 0 ? 
0 : alignment - diff; } +template +decltype(auto) static_unique_ptr_cast(std::unique_ptr&& p) { + static_assert(std::is_base_of::value, + "Derived type must derive from Base."); + auto d = static_cast(p.release()); + return std::unique_ptr(d, p.get_deleter()); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 281902f3a2..474b4fe3d4 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -15,31 +15,51 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "gflags/gflags.h" +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif -#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device_context.h" + #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + +#if CUDA_VERSION >= 10020 +#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" +#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif #endif + #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif -#include "paddle/fluid/platform/npu_info.h" PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, @@ -51,6 +71,15 @@ PADDLE_DEFINE_EXPORTED_bool( "Whether to use system allocator to allocate CPU and GPU memory. " "Only used for unittests."); +PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, + "Use VirtualMemoryAutoGrowthBestFitAllocator."); + +// NOTE(Ruibiao): This FLAGS is just to be compatibled with +// the old single-stream CUDA allocator. It will be removed +// after StreamSafeCudaAllocator has been fully tested. 
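The static_unique_ptr_cast helper added to allocator.h above is how the decorating allocators in this patch recover the fluid Allocation interface (and its base_ptr()) from the generic pten AllocationPtr while keeping the custom deleter. A hedged sketch of the pattern; the Decorate() wrapper itself is illustrative, not part of the patch:

// Hedged sketch of the static_unique_ptr_cast pattern used by the decorating
// allocators (AlignedAllocator, AutoGrowthBestFitAllocator, CUDAGraphAllocator).
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace alloc = paddle::memory::allocation;

alloc::DecoratedAllocationPtr Decorate(alloc::Allocator* underlying,
                                       size_t size) {
  // Allocate() hands back a pten::Allocation smart pointer with a custom
  // deleter; the cast keeps that deleter but exposes base_ptr() on the
  // derived fluid Allocation type.
  auto generic = underlying->Allocate(size);
  return alloc::static_unique_ptr_cast<alloc::Allocation>(std::move(generic));
}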
+PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false, + "Enable StreamSafeCUDAAllocator"); + DECLARE_string(allocator_strategy); namespace paddle { @@ -65,16 +94,16 @@ class CUDAGraphAllocator class PrivateAllocation : public Allocation { public: PrivateAllocation(CUDAGraphAllocator* allocator, - AllocationPtr underlying_allocation) - : Allocation(underlying_allocation->ptr(), - underlying_allocation->size(), - underlying_allocation->place()), + DecoratedAllocationPtr underlying_allocation) + : Allocation( + underlying_allocation->ptr(), underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), allocator_(allocator->shared_from_this()), underlying_allocation_(std::move(underlying_allocation)) {} private: std::shared_ptr allocator_; - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; explicit CUDAGraphAllocator(const std::shared_ptr& allocator) @@ -87,12 +116,14 @@ class CUDAGraphAllocator } protected: - Allocation* AllocateImpl(size_t size) { + pten::Allocation* AllocateImpl(size_t size) { VLOG(10) << "Allocate " << size << " for CUDA Graph"; - return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + return new PrivateAllocation(this, + static_unique_ptr_cast( + underlying_allocator_->Allocate(size))); } - void FreeImpl(Allocation* allocation) { + void FreeImpl(pten::Allocation* allocation) { VLOG(10) << "delete for CUDA Graph"; delete allocation; } @@ -106,46 +137,85 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + using CUDAAllocatorMap = + std::map>>; +#endif + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, false, + paddle::platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is only implemented for auto_growth " + "strategy, not support naive_best_fit strategy")); + + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_ASCEND_CL for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); } InitNaiveBestFitNPUPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_MLU + for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { + InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); + } #endif break; } case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + allow_free_idle_chunk_ = allow_free_idle_chunk; + if (FLAGS_use_stream_safe_cuda_allocator) { + 
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); + ++dev_id) { + InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); + } + } else { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); + ++dev_id) { + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk_); + } + } + InitNaiveBestFitCUDAPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), - allow_free_idle_chunk); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif +#ifdef PADDLE_WITH_MLU + for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { + InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } - InitNaiveBestFitCUDAPinnedAllocator(); #endif break; } @@ -157,12 +227,27 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, false, + paddle::platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is only implemented for auto_growth " + "strategy, not support thread_local strategy")); + + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_MLU + for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { + InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); + } #endif break; } @@ -182,27 +267,9 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } - inline const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } - inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { - VLOG(4) << "GetAllocator" + VLOG(6) << "GetAllocator" << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ @@ -215,25 +282,142 @@ class AllocatorFacadePrivate { return iter->second; } - private: - void InitSystemAllocators() { - if (!system_allocators_.empty()) return; - system_allocators_[platform::CPUPlace()] = std::make_shared(); -#ifdef PADDLE_WITH_XPU - int device_count = platform::GetXPUDeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + bool HasCUDAAllocator(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + auto it = cuda_allocators_.find(place); + if (it == cuda_allocators_.end()) { + return false; + } + const std::map>& allocator_map = + it->second; + return allocator_map.find(stream) != allocator_map.end(); + } + + const std::shared_ptr& GetAllocator( + const platform::CUDAPlace& place, const gpuStream_t& stream, + bool create_if_not_found = false) { + { // shared_lock_guard + std::shared_lock lock_guard( + cuda_allocator_mutex_); + if (LIKELY(HasCUDAAllocator(place, stream))) { + return cuda_allocators_[place][stream]; + } else { + PADDLE_ENFORCE_NE(create_if_not_found, false, + platform::errors::NotFound( + "No allocator found for stream %s in place %s " + "with create_if_not_found = false", + stream, place)); + } } + + { // unique_lock_guard + std::unique_lock lock_guard( + cuda_allocator_mutex_); + InitStreamSafeCUDAAllocator(place, stream); + return cuda_allocators_[place][stream]; + } + } + + gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + return static_cast(pool.Get(place))->stream(); + } + + void RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream) { + if (allocation->size() == 0) { + return; + } + + StreamSafeCUDAAllocation* stream_safe_cuda_allocation = + dynamic_cast(allocation.get()); + PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, + platform::errors::InvalidArgument( + "Failed to dynamic cast %p from Allocation* to " + "StreamSafeCUDAAllocation*", + allocation.get())); + stream_safe_cuda_allocation->RecordStream(stream); + } + + const gpuStream_t& GetStream( + const std::shared_ptr& allocation) const { + const StreamSafeCUDAAllocation* stream_safe_cuda_allocation = + dynamic_cast(allocation.get()); + PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, + platform::errors::InvalidArgument( + "Failed to dynamic cast %p from Allocation* to " + "StreamSafeCUDAAllocation*", + allocation.get())); + return stream_safe_cuda_allocation->GetOwningStream(); + } + +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); + } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } + + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = 
cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - system_allocators_[platform::CUDAPinnedPlace()] = - std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::CUDAPlace p(i); - system_allocators_[p] = std::make_shared(p); +#endif + + private: + class ZeroSizeAllocator : public Allocator { + public: + explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} + bool IsAllocThreadSafe() const override { return true; } + + protected: + pten::Allocation* AllocateImpl(size_t size) override { + return new Allocation(nullptr, 0, place_); + } + void FreeImpl(pten::Allocation* allocation) override { delete allocation; } + + private: + platform::Place place_; + }; + + const AllocatorMap& GetAllocatorMap() { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_allocator_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; + return iter->second->allocators_; + } else { + return allocators_; } +#else + return allocators_; #endif } @@ -248,16 +432,138 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } + void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + PADDLE_ENFORCE_EQ( + strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::Unimplemented( + "Only support auto-growth strategey for StreamSafeCUDAAllocator, " + "the allocator strategy %d is unsupported for multi-stream", + static_cast(strategy_))); + if (LIKELY(!HasCUDAAllocator(p, stream))) { + VLOG(8) << "Init CUDA allocator for stream " << stream << " in place " + << p; + InitAutoGrowthCUDAAllocator(p, stream); + WrapStreamSafeCUDAAllocator(p, stream); + WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); + } + } + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { allocators_[p] = std::make_shared(p); } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { - allocators_[p] = std::make_shared(p); + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { +#if defined(PADDLE_WITH_HIP) + auto cuda_allocator = std::make_shared(p); + cuda_allocators_[p][stream] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10020 + CUdevice device; + int val; + try { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGetAttribute( + &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + device)); + } catch (...) 
{ + val = 0; + } + + if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { + auto cuda_allocator = std::make_shared(p); + cuda_allocators_[p][stream] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), p); + } else { + auto cuda_allocator = std::make_shared(p); + cuda_allocators_[p][stream] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), + allow_free_idle_chunk_); + } +#else + auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } + + cuda_allocators_[p][stream] = std::make_shared( + underlying_allocator, alignment, 0, allow_free_idle_chunk_); +#endif +#endif } + // NOTE(Ruibiao): Old single-stream version, will be removed later void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, bool allow_free_idle_chunk) { +#if defined(PADDLE_WITH_HIP) + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10020 + CUdevice device; + int val; + try { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGetAttribute( + &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + device)); + } catch (...) 
{ + val = 0; + } + + if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), p); + } else { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + } + +#else auto cuda_allocator = std::make_shared(p); auto alignment = platform::GpuMinChunkSize(); bool need_addr_align = true; @@ -292,6 +598,39 @@ class AllocatorFacadePrivate { } allocators_[p] = std::make_shared( underlying_allocator, alignment, 0, allow_free_idle_chunk); +#endif +#endif + } + + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); + } + + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + const std::shared_ptr& underlying_allocator = + cuda_allocators_[p][stream]; + cuda_allocators_[p][stream] = std::make_shared( + underlying_allocator, p, stream); + } + + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, + size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + std::shared_ptr allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared(allocator, retry_time); + } + + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { + for (auto& place_pair : allocators) { + for (auto& stream_pair : place_pair.second) { + PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(), true, + platform::errors::InvalidArgument( + "Public allocators must be thread safe")); + } + } } #endif @@ -301,6 +640,18 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_IPU + void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + +#ifdef PADDLE_WITH_MLU + void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); @@ -310,32 +661,49 @@ class AllocatorFacadePrivate { allocators_[platform::NPUPinnedPlace()] = std::make_shared(); } - #endif - class ZeroSizeAllocator : public Allocator { - public: - explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} - - bool IsAllocThreadSafe() const override { return true; } - - protected: - Allocation* AllocateImpl(size_t size) override { - return new Allocation(nullptr, 0, place_); + void InitSystemAllocators() { + if (!system_allocators_.empty()) return; + system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); } - - void FreeImpl(Allocation* allocation) override { delete allocation; } - - private: - platform::Place place_; - }; +#endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::IPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + system_allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(); + int device_count = platform::GetGPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::CUDAPlace p(i); + 
system_allocators_[p] = std::make_shared(p); + } +#endif +#ifdef PADDLE_WITH_MLU + int device_count = platform::GetMLUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::MLUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif + } void InitZeroSizeAllocators() { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); } @@ -353,6 +721,18 @@ class AllocatorFacadePrivate { places.emplace_back(platform::NPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::IPUPlace(dev_id)); + } +#endif +#ifdef PADDLE_WITH_MLU + int device_count = platform::GetMLUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::MLUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -371,8 +751,14 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator) { + CheckCUDAAllocThreadSafe(cuda_allocators_); + } +#endif } + // NOTE(Ruibiao): Old single-stream version, will be removed later void WrapCUDARetryAllocator(size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, @@ -385,53 +771,21 @@ class AllocatorFacadePrivate { } } -#ifdef PADDLE_WITH_CUDA - - public: - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif - - private: - AllocatorMap allocators_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // a standalone CUDA allocator to support multi-stream GC in new executor + CUDAAllocatorMap cuda_allocators_; + std::shared_timed_mutex cuda_allocator_mutex_; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; +#endif #endif AllocatorStrategy strategy_; - + AllocatorMap allocators_; static AllocatorMap zero_size_allocators_; static AllocatorMap system_allocators_; + bool 
allow_free_idle_chunk_; }; - AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; @@ -447,24 +801,207 @@ AllocatorFacade& AllocatorFacade::Instance() { return instance; } -std::shared_ptr AllocatorFacade::AllocShared( +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + + platform::CUDAPlace cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, place); + return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + } +#endif + + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} + +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { - return std::shared_ptr(Alloc(place, size)); + return std::shared_ptr(Alloc(place, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + size > 0 && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, size)->Allocate(size); + } +#endif + + platform::CUDAPlace cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, place); + return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + } +#endif + return m_->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_ + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); + } +#endif + + platform::CUDAPlace cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, place); + return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + } +#endif return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } -const std::shared_ptr& AllocatorFacade::GetAllocator( - const platform::Place& place) { - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +std::shared_ptr AllocatorFacade::AllocShared( + const platform::Place& place, size_t size, const platform::Stream& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'AllocaShared' function. 
To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + gpuStream_t s = reinterpret_cast(stream.id()); + return std::shared_ptr(Alloc(place, size, s)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); +#endif +} + +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const platform::Stream& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'InSameStream' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); +#endif +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Alloc' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return m_->GetAllocator(p, size)->Allocate(size); + } +} + +uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Release' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + return m_->GetAllocator(place, stream)->Release(place); +} + +void AllocatorFacade::RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "'RecordStream' function. 
To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + m_->RecordStream(allocation, stream); +} + +const gpuStream_t& AllocatorFacade::GetStream( + const std::shared_ptr& allocation) const { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "'GetStream' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + return m_->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA @@ -476,7 +1013,7 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { return m_->RemoveMemoryPoolOfCUDAGraph(id); } #endif - +#endif } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 8d889ec38e..76e2f0b5a9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -19,13 +19,15 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { namespace memory { namespace allocation { + #ifdef PADDLE_WITH_ASCEND_CL using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; #endif @@ -40,22 +42,40 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: - ~AllocatorFacade(); + using Allocation = pten::Allocation; AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; + ~AllocatorFacade(); static AllocatorFacade& Instance(); + const std::shared_ptr& GetAllocator(const platform::Place& place); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); - // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); - // Release unused memory pool. uint64_t Release(const platform::Place& place); - const std::shared_ptr& GetAllocator(const platform::Place& place); + + std::shared_ptr AllocShared(const platform::Place& place, + size_t size, + const platform::Stream& stream); + + bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(zhiqiu): change gpuStream_t to platform::Stream if needed. 
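The multi-stream entry points declared just below can be exercised roughly as follows once FLAGS_use_stream_safe_cuda_allocator is enabled. A hedged sketch; the two stream handles and the kernel launch are assumed to exist on the caller's side:

// Hedged usage sketch for the multi-stream interface declared below;
// requires FLAGS_use_stream_safe_cuda_allocator=true.
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"

void MultiStreamSketch(const paddle::platform::CUDAPlace& place,
                       gpuStream_t alloc_stream, gpuStream_t work_stream) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // Allocate from the per-stream pool owned by alloc_stream.
  std::shared_ptr<pten::Allocation> buf =
      facade.Alloc(place, /*size=*/1 << 20, alloc_stream);
  // ... launch a kernel on work_stream that reads/writes buf->ptr() ...
  // Record the extra stream so the block is not recycled before the pending
  // work on work_stream has finished.
  facade.RecordStream(buf, work_stream);
  facade.Release(place, alloc_stream);  // drops only idle cached memory
}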
+ AllocationPtr Alloc(const platform::Place& place, size_t size, + const gpuStream_t& stream); + uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); + void RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream); + const gpuStream_t& GetStream( + const std::shared_ptr& allocation) const; +#endif #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 9f34f5198a..ad62af8480 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -45,7 +45,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { +pten::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( + size_t unaligned_size) { size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -78,11 +79,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { size_t realloc_size = std::max(size, chunk_size_); try { - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } catch (BadAlloc &ex) { if (FLAGS_free_when_no_cache_hit) throw ex; FreeIdleChunks(); - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } auto *chunk = &(*chunks_.rbegin()); @@ -100,11 +103,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } -void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { - VLOG(10) << "Free " << allocation->size() << " bytes"; +void AutoGrowthBestFitAllocator::FreeImpl(pten::Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() + << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index d1fa6cce01..94aff93ec5 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -36,9 +36,9 @@ class AutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; // Release the memory block which is not used in pool. 
uint64_t ReleaseImpl(const platform::Place &place) override { @@ -64,16 +64,17 @@ class AutoGrowthBestFitAllocator : public Allocator { }; struct Chunk { - explicit Chunk(AllocationPtr allocation) + explicit Chunk(DecoratedAllocationPtr allocation) : allocation_(std::move(allocation)) {} - AllocationPtr allocation_; + DecoratedAllocationPtr allocation_; List blocks_; }; struct BlockAllocation : public Allocation { explicit BlockAllocation(const List::iterator &it) - : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()), + : Allocation(it->ptr_, it->chunk_->allocation_->base_ptr(), it->size_, + it->chunk_->allocation_->place()), block_it_(it) {} List::iterator block_it_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 926af8292d..5942fbe730 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -28,12 +28,12 @@ namespace allocation { class RecordedAllocator : public Allocator { protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { allocated_size_ += size; return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; @@ -79,7 +79,7 @@ class LimitedResourceAllocator : public Allocator { size_t AllocatedSize() const { return allocated_size_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { if (allocated_size_ + size > capacity_) { throw BadAlloc("", __FILE__, __LINE__); } @@ -88,7 +88,7 @@ class LimitedResourceAllocator : public Allocator { return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu index e69de29bb2..5edabfcb9f 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
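The Chunk and BlockAllocation changes above thread a base_ptr through every decorated allocation: a block carved out of a chunk keeps both its own pointer and the device pointer of the chunk it lives in, which is what base_ptr_test.cu (next) checks against platform::GetGpuBasePtr. A simplified, self-contained sketch of the idea, not the Paddle classes themselves:

// Simplified sketch, not the Paddle classes: a sub-allocation remembers the
// base pointer of the underlying chunk in addition to its own offset pointer.
#include <cstddef>

struct ChunkAllocation {   // stands in for the chunk obtained from the device
  void* base;              // pointer returned by the underlying allocator
  size_t size;
};

struct BlockView {         // stands in for BlockAllocation
  void* ptr;               // start of this block inside the chunk
  void* base_ptr;          // base of the whole chunk, kept for APIs that need
                           // the original device pointer
  size_t size;
};

inline BlockView CarveBlock(const ChunkAllocation& chunk, size_t offset,
                            size_t size) {
  return {static_cast<char*>(chunk.base) + offset, chunk.base, size};
}

Whatever the decorator stack looks like (aligned, auto-growth, retry), base_ptr() is meant to keep pointing at the address the device allocator actually handed out.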
+ +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CUDAAllocatoionBasePtrTest : public ::testing::Test { + public: + void SetUp() override { + place_ = platform::CUDAPlace(); + alloc_times_ = 100; + batch_size_ = 10; + max_alloc_size_ = platform::GpuMaxAllocSize() / alloc_times_; + random_engine_ = std::default_random_engine(time(NULL)); + dis_ = std::uniform_int_distribution(0, max_alloc_size_); + } + + void OneByOneAllocTest() { + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + AllocationPtr allocation = Alloc(place_, size); + + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + Release(place_); + } + + void BatchByBatchAllocTest() { + std::vector allocations; + allocations.reserve(batch_size_); + size_t batch_num = alloc_times_ / batch_size_; + + for (size_t i = 0; i < batch_num; ++i) { + for (size_t j = 0; j < batch_size_; ++j) { + size_t size = dis_(random_engine_); + AllocationPtr allocation = Alloc(place_, size); + + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(std::move(allocation)); + } + allocations.clear(); + } + + Release(place_); + } + + void ContinuousAllocTest() { + std::vector allocations; + allocations.reserve(alloc_times_); + + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + AllocationPtr allocation = Alloc(place_, size); + + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(std::move(allocation)); + } + + allocations.clear(); + Release(place_); + } + + void ZeroSizeAllocTest() { + AllocationPtr allocation = Alloc(place_, 0); + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + private: + platform::CUDAPlace place_; + size_t max_alloc_size_; + size_t alloc_times_; + size_t batch_size_; + std::default_random_engine random_engine_; + std::uniform_int_distribution dis_; +}; + +TEST_F(CUDAAllocatoionBasePtrTest, base_ptr_test) { + OneByOneAllocTest(); + BatchByBatchAllocTest(); + ContinuousAllocTest(); + ZeroSizeAllocTest(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 0955b52126..3cba70bd5b 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -33,7 +33,7 @@ static int HighestBitPos(size_t N) { } } -BestFitAllocator::BestFitAllocator(Allocation* allocation) +BestFitAllocator::BestFitAllocator(pten::Allocation* allocation) : allocation_(allocation) { details::Chunk chunk; chunk.size_ = allocation_->size(); @@ -115,7 +115,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::FreeImpl(pten::Allocation* allocation) { auto* 
bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( bf_allocation, @@ -150,7 +150,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) { InsertFreeNode(chunk_it); delete allocation; } -Allocation* BestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation* BestFitAllocator::AllocateImpl(size_t size) { auto highest_set_bit = static_cast(HighestBitPos(size)); MapIt map_it; for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 42f69e6d70..297d876178 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -108,7 +108,7 @@ class BestFitAllocation : public Allocation { // the prev-chunk and the next-chunk when possible. class BestFitAllocator : public Allocator { public: - explicit BestFitAllocator(Allocation* allocation); + explicit BestFitAllocator(pten::Allocation* allocation); void* BasePtr() const { return allocation_->ptr(); } @@ -127,11 +127,11 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: - Allocation* allocation_; // not owned + pten::Allocation* allocation_; // not owned details::ChunkList chunks_; details::FreeChunkBin free_chunks_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index 325cb010bf..11739ebba9 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -46,12 +46,13 @@ void BufferedAllocator::FreeCache(size_t size) { bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } -void BufferedAllocator::FreeImpl(Allocation *allocation) { +void BufferedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); - allocations_.emplace(allocation->size(), AllocationPtr(allocation)); + allocations_.emplace(allocation->size(), + AllocationPtr(allocation, Allocator::AllocationDeleter)); } -Allocation *BufferedAllocator::AllocateImpl(size_t size) { +pten::Allocation *BufferedAllocator::AllocateImpl(size_t size) { { platform::LockGuardPtr guard(mtx_); auto it = allocations_.lower_bound(size); diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index 5e1733bd83..0ccccef573 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -45,8 +45,8 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 0bfa10a161..21c30efccd 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -27,7 +27,7 @@ namespace memory { namespace allocation { inline std::unique_ptr 
GetBufferedAllocator( - Allocation *allocation, bool thread_safe) { + pten::Allocation *allocation, bool thread_safe) { std::unique_ptr allocator(new BestFitAllocator(allocation)); if (thread_safe) { allocator.reset(new LockedAllocator(std::move(allocator))); @@ -68,7 +68,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( alloc, platform::errors::InvalidArgument( @@ -77,7 +77,7 @@ class StubAllocator : public Allocator { ++destruct_count_; delete allocation; } - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++construct_count_; if (size == 0) { return new StubAllocation(nullptr, 0, platform::CPUPlace()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 128591f5a8..bf0bd891be 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { +void CPUAllocator::FreeImpl(pten::Allocation *allocation) { void *p = allocation->ptr(); #ifdef _WIN32 _aligned_free(p); @@ -34,7 +34,7 @@ void CPUAllocator::FreeImpl(Allocation *allocation) { delete allocation; } -Allocation *CPUAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUAllocator::AllocateImpl(size_t size) { void *p; #ifdef _WIN32 p = _aligned_malloc(size, kAlignment); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 058ff63381..a64089dd2d 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -37,8 +37,8 @@ class CPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; }; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index b1a45afa99..ff9bbf4ab3 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -25,34 +25,34 @@ #include #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. 
This may be a bug")); - platform::RecordedCudaFree(allocation->ptr(), allocation->size(), - place_.device); + platform::RecordedGpuFree(allocation->ptr(), allocation->size(), + place_.device); delete allocation; } -Allocation* CUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; - auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device); + auto result = platform::RecordedGpuMalloc(&ptr, size, place_.device); if (LIKELY(result == gpuSuccess)) { return new Allocation(ptr, size, platform::Place(place_)); } size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); size_t allocated = total - avail; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 5969d4d20d..57e85a3dc2 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -28,8 +28,8 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 3d6f1d7bcb..a6696634c1 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -41,8 +41,9 @@ namespace allocation { */ class CUDADeviceContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(AllocationPtr allocation) - : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) + : Allocation(allocation->ptr(), allocation->base_ptr(), + allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} ~CUDADeviceContextAllocation() { @@ -55,7 +56,7 @@ class CUDADeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -64,7 +65,7 @@ class CUDADeviceContextAllocation : public Allocation { } private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; const platform::CUDADeviceContext *dev_ctx_{nullptr}; }; @@ -81,10 +82,10 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); #endif } @@ -93,35 +94,34 @@ class CUDADeviceContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( "Default stream is not set for CUDADeviceContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = - new CUDADeviceContextAllocation(memory::Alloc(place_, size)); + auto allocation = new CUDADeviceContextAllocation( + static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); #endif return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index e69de29bb2..2ae2cf20ee 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#include +#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif +#if CUDA_VERSION >= 10020 + +namespace paddle { +namespace memory { +namespace allocation { + +CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( + const platform::CUDAPlace& place) + : place_(place) { + CUmemAllocationProp prop = {}; + + // Setup the properties common for all the chunks + // The allocations will be device pinned memory. + // This property structure describes the physical location where the memory + // will be allocated via cuMemCreate allong with additional properties In this + // case, the allocation will be pinnded device memory local to a given device. 
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = place.device; + prop_ = prop; + + // Prepare the access descriptor array indicating where and how the backings + // should be visible. + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { + if (place.device != dev_id) { + int capable = 0; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceCanAccessPeer(&capable, place.device, dev_id)); + if (!capable) { + VLOG(1) << "device(" << place.device + << ") can not access peer to device(" << dev_id << ")"; + continue; + } + } + CUmemAccessDesc access_desc = {}; + // Specify which device we are adding mappings for. + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = dev_id; + + // Specify both read and write access. + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc_.push_back(access_desc); + } + + // Get the minimum granularity needed for all devices + // (the max of the minimum granularity of each participating device) + granularity_ = 0; + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { + size_t granularity; + prop.location.id = dev_id; + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuMemGetAllocationGranularity( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + granularity_ = std::max(granularity, granularity_); + } + + size_t actual_avail, actual_total; + paddle::platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + + virtual_mem_size_ = AlignedSize(actual_total, granularity_); + + // Reserve the required contiguous virtual address space for the allocations + // The maximum video memory size we can apply for is the video memory size of + // GPU, + // so the virtual address space size we reserve is equal to the GPU video + // memory size + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( + &virtual_mem_base_, virtual_mem_size_, 0, 0, 0)); + + virtual_mem_alloced_offset_ = 0; +} + +bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } + +void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "GPU memory is freed in incorrect device. 
This may be a bug")); + + auto iter = virtual_2_physical_map_.find( + reinterpret_cast(allocation->ptr())); + if (iter == virtual_2_physical_map_.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Can not find virtual memory address at %s", allocation->ptr())); + } + + int prev_id; + cudaGetDevice(&prev_id); + if (prev_id != place_.device) { + cudaSetDevice(place_.device); + } + + auto result = + paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second); + if (result != CUDA_ERROR_DEINITIALIZED) { + PADDLE_ENFORCE_GPU_SUCCESS(result); + } + + if (result != CUDA_ERROR_DEINITIALIZED) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::RecordedGpuMemRelease( + iter->second.first, iter->second.second, place_.device)); + } + + if (prev_id != place_.device) { + cudaSetDevice(prev_id); + } + + virtual_2_physical_map_.erase(iter); + + delete allocation; +} + +pten::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { + size = AlignedSize(size, granularity_); + + CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_; + + if (ptr + size > virtual_mem_base_ + virtual_mem_size_) { + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on GPU Virtual Memory %d. " + "Cannot allocate %s memory on GPU Virtual Memory %d, %s memory has " + "been allocated and " + "available memory is only %s.\n\n" + "Please decrease the batch size of your model.\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(virtual_mem_alloced_offset_), + string::HumanReadableSize(virtual_mem_size_ - + virtual_mem_alloced_offset_), + place_.device)); + return nullptr; + } + + CUmemGenericAllocationHandle handle; + + paddle::platform::CUDADeviceGuard guard(place_.device); + + // Create physical memory backing allocation. + auto result = + platform::RecordedGpuMemCreate(&handle, size, &prop_, 0, place_.device); + + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OUT_OF_MEMORY) { + size_t actual_avail, actual_total; + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + size_t actual_allocated = actual_total - actual_avail; + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on GPU %d. " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated " + "and " + "available memory is only %s.\n\n" + "Please check whether there is any other process using GPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" + "2. If no, please decrease the batch size of your model.\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(actual_allocated), + string::HumanReadableSize(actual_avail), place_.device)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(result); + } + return nullptr; + } + + // Assign the chunk to the appropriate VA range and release the handle. + // After mapping the memory, it can be referenced by virtual address. + // The allocation will be kept live until it is unmapped. + result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0); + + if (result != CUDA_SUCCESS) { + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); + return nullptr; + } + + // Apply the access descriptors to the whole VA range. 
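CUDAVirtualMemAllocator is a thin wrapper over the CUDA driver's virtual memory management API (available from CUDA 10.2): reserve a large virtual range once, then back it piecewise with physical handles that are mapped and granted access on demand. A condensed sketch of that driver-API sequence using the raw cuMem* calls (Paddle routes them through dynload and the Recorded* wrappers); error checks and an existing CUDA context are assumed:

// Condensed sketch of the driver-API sequence the allocator wraps.
// Error checking omitted; assumes cuInit() and a current context.
#include <cuda.h>

CUdeviceptr ReserveAndMap(int device, size_t bytes) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  size_t granularity = 0;
  cuMemGetAllocationGranularity(&granularity, &prop,
                                CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size_t size = (bytes + granularity - 1) / granularity * granularity;

  CUdeviceptr va = 0;                       // reserve virtual address space
  cuMemAddressReserve(&va, size, 0, 0, 0);

  CUmemGenericAllocationHandle handle;      // create physical backing
  cuMemCreate(&handle, size, &prop, 0);
  cuMemMap(va, size, 0, handle, 0);         // map it into the reserved range

  CUmemAccessDesc access = {};              // grant read/write on this device
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(va, size, &access, 1);
  return va;                                // teardown: cuMemUnmap, cuMemRelease,
                                            // then cuMemAddressFree
}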
+ result = paddle::platform::dynload::cuMemSetAccess( + ptr, size, access_desc_.data(), access_desc_.size()); + + if (result != CUDA_SUCCESS) { + paddle::platform::dynload::cuMemUnmap(ptr, size); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); + return nullptr; + } + + virtual_2_physical_map_.emplace(ptr, std::make_pair(handle, size)); + + virtual_mem_alloced_offset_ += size; + + return new Allocation(reinterpret_cast(ptr), size, + platform::Place(place_)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index e69de29bb2..0e1e59d200 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +#if CUDA_VERSION >= 10020 + +namespace paddle { +namespace memory { +namespace allocation { + +// Allocate memory using NVIDIA's virtual memory management technology +class CUDAVirtualMemAllocator : public Allocator { + public: + explicit CUDAVirtualMemAllocator(const platform::CUDAPlace& place); + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; + + private: + platform::CUDAPlace place_; + + CUdeviceptr virtual_mem_base_; + size_t virtual_mem_size_; + size_t virtual_mem_alloced_offset_; + size_t granularity_; + + CUmemAllocationProp prop_; + std::vector access_desc_; + + std::map> + virtual_2_physical_map_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index 6e8f870b23..a0c8efddbd 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -37,12 +37,12 @@ LockedAllocator::LockedAllocator( } } -void LockedAllocator::FreeImpl(Allocation *allocation) { +void LockedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); underlying_allocator_->Free(allocation); } -Allocation *LockedAllocator::AllocateImpl(size_t size) { +pten::Allocation *LockedAllocator::AllocateImpl(size_t size) { platform::LockGuardPtr guard(mtx_); return underlying_allocator_->Allocate(size).release(); } diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 1b8418bc84..d17c8b24e2 100644 --- 
a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -29,8 +29,8 @@ class LockedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 2c00b34dd1..ffe7ccf919 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,9 +20,8 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" -#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -30,9 +29,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu/xpu_header.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -114,30 +111,43 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For Graphcore IPU +template <> +void *Alloc(const platform::IPUPlace &place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "IPUPlace, Allocate on cpu."; + + void *p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } + VLOG(10) << " pointer=" << p; + return p; +} +template <> +void Free(const platform::IPUPlace &place, void *p, + size_t size) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} +template <> +uint64_t Release(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Release(); +} +template <> +size_t Used(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Used(); +} + // For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = nullptr; - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_malloc(reinterpret_cast(&p), size); + + platform::XPUDeviceGuard gurad(place.device); + int ret = xpu_malloc(reinterpret_cast(&p), size); if (ret != XPU_SUCCESS) { std::cout << "xpu memory malloc(" << size << ") failed, try again\n"; xpu_wait(); @@ -151,12 +161,6 @@ void *Alloc(const platform::XPUPlace 
&place, size_t size) { PADDLE_THROW(platform::errors::Unimplemented( "xpu memory FLAGS_init_allocated_mem is not implemented.")); } - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); VLOG(10) << " pointer=" << p; return p; #else @@ -172,30 +176,9 @@ void Free(const platform::XPUPlace &place, void *p, #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + + platform::XPUDeviceGuard gurad(place.device); xpu_free(p); - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); @@ -327,8 +310,8 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { size_t avail, total; platform::NPUMemoryUsage(&avail, &total); PADDLE_THROW(platform::errors::ResourceExhausted( - "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " - "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " + "%s, NpuMaxChunkSize %s, NPU memory used: %s.", string::HumanReadableSize(size), place.device, string::HumanReadableSize(avail), string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), @@ -336,7 +319,7 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { string::HumanReadableSize(Used(place)))); } else { if (FLAGS_init_allocated_mem) { - aclrtMemset(ptr, size, 0xEF, size); + platform::NPUMemsetSync(ptr, 0xEF, size, size); } } VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); @@ -387,8 +370,7 @@ void *Alloc(const platform::NPUPinnedPlace &place, void *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size - << " bytes in NPUPinnedPlace"; + LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace"; } if (FLAGS_init_allocated_mem) { memset(ptr, 0xEF, size); @@ -622,6 +604,134 @@ uint64_t Release( #endif } +// For MLU +#ifdef PADDLE_WITH_MLU +class MLUBuddyAllocatorList { + private: + MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) { + auto mlu_num = devices_.size(); + allocators_.resize(mlu_num); + init_flags_.reserve(mlu_num); + for (size_t i = 0; i < mlu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static MLUBuddyAllocatorList *CreateNewInstance() { + return new MLUBuddyAllocatorList(); + } + + public: + static MLUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int mlu_id) { + 
auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetMLUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::MLUAllocator(devices_[pos])), + platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "(mlu reuse gpu GFlags) " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for MLU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) { + return MLUBuddyAllocatorList::Instance()->Get(mlu_id); +} +#endif + +template <> +size_t Used(const platform::MLUPlace &place) { +#ifdef PADDLE_WITH_MLU + return GetMLUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::MLUPlace &place, size_t size) { +#ifdef PADDLE_WITH_MLU + auto *buddy_allocator = GetMLUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::MLUDeviceGuard(place.device); + size_t avail = 0, total = 0; + platform::MLUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " + "%s, MLUMinChunkSize %s, MLU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + cnrtMemset(ptr, 0xEF, size); + } + } + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::MLUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_MLU + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetMLUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::MLUPlace &place) { +#ifdef PADDLE_WITH_MLU + return GetMLUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} @@ -680,7 +790,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace 
&cuda_pinned) const { namespace allocation { -Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( @@ -688,7 +798,7 @@ Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { return tmp_alloc; } -void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { +void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 474a308a06..b7b3647ff9 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -34,8 +34,8 @@ class NaiveBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index faf7ae6221..d9fa7ec27f 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -14,15 +14,15 @@ #include "paddle/fluid/memory/allocation/npu_allocator.h" #include +#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(Allocation* allocation) { +void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -32,7 +32,7 @@ void NPUAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* NPUAllocator::AllocateImpl(size_t size) { +pten::Allocation* NPUAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetNPUDeviceId(place_.device); }); diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index bf66897350..88b0c9a24b 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -28,8 +28,8 @@ class NPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::NPUPlace place_; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 9178825efa..2389973fa9 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -23,22 +23,22 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { for (auto it = npu_events_.begin(); it != 
npu_events_.end();) { aclrtEvent event = it->second; aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { - Allocation *allocation = it->first; + auto *allocation = it->first; void *ptr = allocation->ptr(); free(ptr); npu_events_.erase(it++); delete allocation; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + platform::NPUEventDestroy(event); } else { ++it; } } } -Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; @@ -50,18 +50,29 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { return new Allocation(ptr, size, platform::NPUPinnedPlace()); } -void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void NPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); + + // Managed by GC if not called RecordEvent. + if (iter == npu_events_.end()) { + // double free? No such problem has been found so far. + // Or maybe we need a set to record which + // Allocation managed by GC. + free(ptr); + delete allocation; + return; + } + aclrtEvent event = iter->second; aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { free(ptr); npu_events_.erase(allocation); delete allocation; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + platform::NPUEventDestroy(event); } return; } @@ -72,12 +83,12 @@ uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { return static_cast(0); } -void NPUPinnedAllocator::RecordEvent(Allocation *allocation, +void NPUPinnedAllocator::RecordEvent(pten::Allocation *allocation, aclrtStream stream) { std::lock_guard lock(mtx_); aclrtEvent event = nullptr; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); - PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream)); + platform::NPUEventCreate(&event); + platform::NPUEventRecord(event, stream); npu_events_.insert({allocation, event}); } diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index b330b6e352..716b12eea1 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -21,7 +21,7 @@ #include "acl/acl.h" #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -32,16 +32,16 @@ class NPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override { return true; } void ProcessEventsAndFree(); - void RecordEvent(Allocation *allocation, aclrtStream stream); + void RecordEvent(pten::Allocation *allocation, aclrtStream stream); constexpr static size_t kAlignment = 4096UL; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: - std::unordered_map npu_events_; + std::unordered_map npu_events_; 
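NPUPinnedAllocator defers freeing a pinned host buffer until the ACL event recorded for its async copy has completed, and with this patch it also frees immediately when no event was ever recorded. A minimal sketch of that pattern, not the allocator itself, with error handling omitted:

// Sketch of the deferred-free pattern: the host buffer behind an async copy
// is only released once the event recorded on the copy stream has completed.
#include <cstdlib>
#include <unordered_map>
#include "acl/acl.h"

std::unordered_map<void*, aclrtEvent> g_pending;  // buffer -> completion event

void RecordPinnedCopy(void* host_buf, aclrtStream stream) {
  aclrtEvent event = nullptr;
  aclrtCreateEvent(&event);
  aclrtRecordEvent(event, stream);   // completes when the enqueued copy is done
  g_pending[host_buf] = event;
}

void TryFreePinned(void* host_buf) {
  auto it = g_pending.find(host_buf);
  if (it == g_pending.end()) {       // never recorded: safe to free right away
    free(host_buf);
    return;
  }
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  aclrtQueryEvent(it->second, &status);
  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(host_buf);
    aclrtDestroyEvent(it->second);
    g_pending.erase(it);
  }                                  // otherwise retry on the next sweep
}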
mutable std::mutex mtx_; }; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5aa0514432..f1175fc437 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -18,20 +18,20 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif delete allocation; } -Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 4f535ef337..800e3ff3bb 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -25,8 +25,8 @@ class CPUPinnedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808..856b6c2e9a 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -39,7 +39,7 @@ class WaitedAllocateSizeGuard { size_t requested_size_; }; -void RetryAllocator::FreeImpl(Allocation* allocation) { +void RetryAllocator::FreeImpl(pten::Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); underlying_allocator_->Free(allocation); @@ -51,7 +51,7 @@ void RetryAllocator::FreeImpl(Allocation* allocation) { } } -Allocation* RetryAllocator::AllocateImpl(size_t size) { +pten::Allocation* RetryAllocator::AllocateImpl(size_t size) { auto alloc_func = [&, this]() { return underlying_allocator_->Allocate(size).release(); }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97..b427a37907 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -45,8 +45,8 @@ class RetryAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; uint64_t ReleaseImpl(const platform::Place& place) override { return underlying_allocator_->Release(place); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 787f3d9dca..d636c73e07 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -98,12 +98,12 @@ class DummyAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "Here is a test exception, always BadAlloc.")); } - void FreeImpl(Allocation *) override {} + void FreeImpl(pten::Allocation *) override {} }; TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index e69de29bb2..05c6a7adaf 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
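The StreamSafeCUDAAllocation implemented next keeps one CUDA event per additional consumer stream: RecordStream (re)records that stream's event, and CanBeFreed only returns true once every recorded event has completed, at which point FreeImpl really releases the block. A condensed sketch of that bookkeeping with the CUDA runtime API (error checks omitted):

// Minimal sketch of the per-stream event bookkeeping implemented below.
#include <map>
#include <cuda_runtime.h>

struct StreamGuardedBlock {
  cudaStream_t owner;
  std::map<cudaStream_t, cudaEvent_t> events;  // one event per extra stream

  void RecordStream(cudaStream_t s) {
    if (s == owner) return;                    // owning stream needs no event
    auto it = events.find(s);
    if (it == events.end()) {
      cudaEvent_t e;
      cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
      it = events.emplace(s, e).first;
    }
    cudaEventRecord(it->second, s);            // re-record on every use
  }

  bool CanBeFreed() {                          // true once all streams are done
    for (auto it = events.begin(); it != events.end();) {
      if (cudaEventQuery(it->second) == cudaErrorNotReady) return false;
      cudaEventDestroy(it->second);
      it = events.erase(it);
    }
    return true;
  }
};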
+ +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)), + owning_stream_(std::move(owning_stream)) {} + +void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { + VLOG(8) << "Try record stream " << stream << " for address " << ptr(); + if (stream == owning_stream_) { + VLOG(9) << "Record the same stream of " << stream; + return; + } + + std::lock_guard lock_guard(outstanding_event_map_lock_); + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + +bool StreamSafeCUDAAllocation::CanBeFreed() { + // NOTE(Ruibiao): This function will not execute concurrently, + // so outstanding_event_lock_ is not required here + for (auto it = outstanding_event_map_.begin(); + it != outstanding_event_map_.end(); ++it) { + gpuEvent_t& event = it->second; +#ifdef PADDLE_WITH_CUDA + gpuError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); +#else + gpuError_t err = hipEventQuery(event); + if (err == hipErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#endif + VLOG(8) << "Destroy event " << event; + } + return true; +} + +const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { + return owning_stream_; +} + +StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( + std::shared_ptr underlying_allocator, platform::CUDAPlace place, + gpuStream_t default_stream) + : underlying_allocator_(std::move(underlying_allocator)), + place_(std::move(place)), + default_stream_(std::move(default_stream)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); +} + +StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + 
allocators.end()); +} + +bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } + +pten::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { + ProcessUnfreedAllocations(); + VLOG(8) << "Try allocate " << size << " bytes"; + AllocationPtr underlying_allocation; + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (BadAlloc&) { + VLOG(4) << "Allocation failed when allocating " << size << " bytes"; + ReleaseImpl(place_); + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (...) { + VLOG(3) + << "Still allocation failed after release memory from all streams"; + throw; + } + } catch (...) { + throw; + } + StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( + static_unique_ptr_cast(std::move(underlying_allocation)), + default_stream_); + VLOG(8) << "Allocate " << allocation->size() << " bytes at address " + << allocation->ptr(); + return allocation; +} + +void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { + StreamSafeCUDAAllocation* stream_safe_cuda_allocation = + dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, + platform::errors::InvalidArgument( + "Failed to dynamic cast %p from Allocation* to " + "StreamSafeCUDAAllocation*", + allocation)); + VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); + std::lock_guard lock_guard(unfreed_allocation_lock_); + if (stream_safe_cuda_allocation->CanBeFreed()) { + VLOG(9) << "Directly delete allocation"; + delete stream_safe_cuda_allocation; + } else { + VLOG(9) << "Put into unfreed_allocation list"; + unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); + } +} + +uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = + allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)]; + uint64_t released_size = 0; + for (StreamSafeCUDAAllocator* allocator : allocators) { + released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + } + VLOG(8) << "Release " << released_size << " bytes memory from all streams"; + return released_size; +} + +void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { + std::lock_guard lock_guard(unfreed_allocation_lock_); + for (auto it = unfreed_allocations_.begin(); + it != unfreed_allocations_.end();) { + if ((*it)->CanBeFreed()) { + delete *it; + it = unfreed_allocations_.erase(it); + } else { + ++it; + } + } +} + +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { + ProcessUnfreedAllocations(); + return underlying_allocator_->Release(place_); +} + +std::map> + StreamSafeCUDAAllocator::allocator_map_; +SpinLock StreamSafeCUDAAllocator::allocator_map_lock_; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index e69de29bb2..f54cdc7496 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -0,0 +1,81 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_CUDA +#include +#else +#include +#endif + +namespace paddle { +namespace memory { +namespace allocation { + +class StreamSafeCUDAAllocation : public Allocation { + public: + StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, + gpuStream_t owning_stream); + void RecordStream(const gpuStream_t &stream); + bool CanBeFreed(); + + const gpuStream_t &GetOwningStream() const; + + private: + DecoratedAllocationPtr underlying_allocation_; + std::map outstanding_event_map_; + gpuStream_t owning_stream_; + SpinLock outstanding_event_map_lock_; +}; + +class StreamSafeCUDAAllocator : public Allocator { + public: + StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, + platform::CUDAPlace place, + gpuStream_t default_stream); + ~StreamSafeCUDAAllocator(); + bool IsAllocThreadSafe() const override; + + protected: + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + void ProcessUnfreedAllocations(); + uint64_t ProcessUnfreedAllocationsWithRelease(); + + static std::map> + allocator_map_; + static SpinLock allocator_map_lock_; + + std::shared_ptr underlying_allocator_; + platform::CUDAPlace place_; + gpuStream_t default_stream_; + std::list unfreed_allocations_; + SpinLock unfreed_allocation_lock_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc index 3eb1f140ed..987c7ea772 100644 --- a/paddle/fluid/memory/allocation/test_aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/test_aligned_allocator.cc @@ -32,12 +32,12 @@ struct StubAllocator : public Allocator { size_t AllocNum() const { return alloc_num_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++alloc_num_; return new Allocation(new uint8_t[size], size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { delete[] static_cast(allocation->ptr()); delete allocation; --alloc_num_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 654fb3fe7b..9c93065170 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -20,7 +20,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { @@ -83,11 +83,11 @@ class ThreadLocalCUDAAllocator : 
public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl( size); } - void FreeImpl(Allocation* allocation) override { + void FreeImpl(pten::Allocation* allocation) override { auto* tl_allocation = static_cast(allocation); auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index e69de29bb2..face27debe 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NeedSplit(size_t block_size, size_t alignment, size_t allock_size) { + return block_size > (allock_size * 2) || + (block_size - allock_size) > alignment; +} + +VirtualMemoryAutoGrowthBestFitAllocator:: + VirtualMemoryAutoGrowthBestFitAllocator( + const std::shared_ptr &underlying_allocator, + size_t alignment, const platform::CUDAPlace &place) + : underlying_allocator_( + std::make_shared(underlying_allocator, alignment)), + alignment_(alignment), + place_(place) {} + +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl( + size_t size) { + std::lock_guard guard(spinlock_); + size = AlignedSize(size, alignment_); + auto result = AllocFromFreeBlocks(size); + + if (!result) { + ExtendAndMerge(size); + result = AllocFromFreeBlocks(size); + } + + return result; +} + +void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl( + pten::Allocation *allocation) { + std::lock_guard guard(spinlock_); + auto block_it = static_cast(allocation)->block_it_; + TryMergeBlock2Blocks(block_it); + delete allocation; +} + +void VirtualMemoryAutoGrowthBestFitAllocator::TryMergeBlock2Blocks( + std::list::iterator block) { + if (block->ptr_ == all_blocks_.front().ptr_ && + block->ptr_ == all_blocks_.back().ptr_) { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } else if (block->ptr_ == all_blocks_.front().ptr_) { + auto next = std::next(block); + if (next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == next->ptr_) { + // merge with next + block->size_ += next->size_; + block->is_free_ = true; + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + all_blocks_.erase(next); + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } 
else { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } + } else if (block->ptr_ == all_blocks_.back().ptr_) { + auto pre = std::prev(block); + if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == block->ptr_) { + // merge with pre + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + pre->size_ += block->size_; + all_blocks_.erase(block); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } + } else { + auto pre = std::prev(block); + auto next = std::next(block); + if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == block->ptr_ && + !(next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == + next->ptr_)) { + // merge with pre + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + pre->size_ += block->size_; + all_blocks_.erase(block); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else if (next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == + next->ptr_ && + !(pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == + block->ptr_)) { + // merge with next + block->size_ += next->size_; + block->is_free_ = true; + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + all_blocks_.erase(next); + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } else if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == + block->ptr_ && + next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == + next->ptr_) { + // merge with pre and next + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + pre->size_ += (block->size_ + next->size_); + all_blocks_.erase(block); + all_blocks_.erase(next); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } + } +} + +void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) { + void *ptr = nullptr; + + auto allocateptr = underlying_allocator_->Allocate(size); + ptr = allocateptr->ptr(); + size = allocateptr->size(); + allocations_.push_back(std::move(allocateptr)); // hold allocation + + if (all_blocks_.empty()) { + all_blocks_.push_back(Block(ptr, size, true)); + free_blocks_.emplace(std::make_pair(size, ptr), all_blocks_.begin()); + return; + } + for (auto block_it = all_blocks_.begin(); block_it != all_blocks_.end(); + ++block_it) { + if (block_it->ptr_ > ptr) { + if (block_it == all_blocks_.begin()) { + // insert to front + if (block_it->is_free_ && + reinterpret_cast(ptr) + size == block_it->ptr_) { + // merge with next + free_blocks_.erase(std::make_pair(block_it->size_, block_it->ptr_)); + block_it->ptr_ = ptr; + block_it->size_ += size; + free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_), + block_it); + } else { + // do not merge + all_blocks_.push_front(Block(ptr, size, true)); + free_blocks_.emplace(std::make_pair(size, ptr), all_blocks_.begin()); + } + } else { + // insert to middle + auto next = block_it; + auto pre = std::prev(block_it); + if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == ptr && + !(next->is_free_ && + reinterpret_cast(ptr) + size == next->ptr_)) { + // merge with pre + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + pre->size_ += size; + 
free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else if (next->is_free_ && + reinterpret_cast(ptr) + size == next->ptr_ && + !(pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == + ptr)) { + // merge with next + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + next->ptr_ = ptr; + next->size_ += size; + free_blocks_.emplace(std::make_pair(next->size_, next->ptr_), next); + } else if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == ptr && + next->is_free_ && + reinterpret_cast(ptr) + size == next->ptr_) { + // merge with pre and next + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + pre->size_ += (size + next->size_); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + all_blocks_.erase(next); + } else { + // do not merge + auto iter = all_blocks_.insert(next, Block(ptr, size, true)); + free_blocks_.emplace(std::make_pair(size, ptr), iter); + } + } + return; + } + } + + // insert to back + auto block_it = all_blocks_.end(); + block_it--; + if (block_it->is_free_ && + reinterpret_cast(block_it->ptr_) + block_it->size_ == ptr) { + // merge with pre + free_blocks_.erase(std::make_pair(block_it->size_, block_it->ptr_)); + block_it->size_ += size; + free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_), + block_it); + } else { + // do not merge + all_blocks_.push_back(Block(ptr, size, true)); + auto block_it = all_blocks_.end(); + block_it--; + free_blocks_.emplace(std::make_pair(size, ptr), block_it); + } +} + +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( + size_t size) { + auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); + if (iter != free_blocks_.end()) { + std::list::iterator block_it = iter->second; + free_blocks_.erase(iter); + if (NeedSplit(block_it->size_, alignment_, size)) { + size_t remaining_size = block_it->size_ - size; + auto remaining_free_block = all_blocks_.insert( + block_it, Block(block_it->ptr_, remaining_size, true)); + free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_), + remaining_free_block); + block_it->ptr_ = + reinterpret_cast(block_it->ptr_) + remaining_size; + block_it->size_ = size; + } + + block_it->is_free_ = false; + return new BlockAllocation(block_it, place_); + } + + return nullptr; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h index e69de29bb2..10bf0bbf49 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
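The free-list bookkeeping above hinges on one data-structure choice: free blocks are indexed by a (size, ptr) pair, so std::map::lower_bound({size, nullptr}) yields the smallest block that still fits the request, i.e. best fit. A self-contained toy version of that lookup (plain STL, illustrative names, not Paddle code) is sketched below.

#include <cstddef>
#include <cstdio>
#include <iterator>
#include <list>
#include <map>
#include <utility>

struct ToyBlock {
  void* ptr;
  size_t size;
  bool is_free;
};

int main() {
  std::list<ToyBlock> all_blocks;
  std::map<std::pair<size_t, void*>, std::list<ToyBlock>::iterator> free_blocks;

  static char arena[1024];
  // Two free blocks: 256 bytes at arena, 768 bytes at arena + 256.
  all_blocks.push_back({arena, 256, true});
  free_blocks.emplace(std::pair<size_t, void*>(256, arena),
                      std::prev(all_blocks.end()));
  all_blocks.push_back({arena + 256, 768, true});
  free_blocks.emplace(std::pair<size_t, void*>(768, arena + 256),
                      std::prev(all_blocks.end()));

  // Best fit for a 300-byte request: lower_bound skips the 256-byte block
  // and lands on the 768-byte one, just like AllocFromFreeBlocks above.
  auto it = free_blocks.lower_bound(std::pair<size_t, void*>(300, nullptr));
  if (it != free_blocks.end()) {
    it->second->is_free = false;
    std::printf("picked a %zu-byte block\n", it->second->size);
    free_blocks.erase(it);
  }
  return 0;
}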
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" + +namespace paddle { +namespace memory { +namespace allocation { + +struct Block { + Block(void *ptr, size_t size, bool is_free) + : ptr_(ptr), size_(size), is_free_(is_free) {} + + void *ptr_; + size_t size_; + bool is_free_; +}; + +struct BlockAllocation : public Allocation { + explicit BlockAllocation(const std::list::iterator &it, + platform::Place place) + : Allocation(it->ptr_, it->size_, place), block_it_(it) {} + + std::list::iterator block_it_; +}; + +/** + * Like AutoGrowthBestFitAllocator, VirtualMemoryAutoGrowthBestFitAllocator + * gradually requests more GPU memory as the model's memory footprint grows. + * The difference is that VirtualMemoryAutoGrowthBestFitAllocator uses + * NVIDIA's virtual memory management technology and works with virtual + * addresses. If two successively allocated regions happen to be contiguous + * in the virtual address space, they can later be merged into one block, + * which greatly reduces fragmentation. + */ +class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator { + public: + VirtualMemoryAutoGrowthBestFitAllocator( + const std::shared_ptr &underlying_allocator, size_t alignment, + const platform::CUDAPlace &place); + + bool IsAllocThreadSafe() const override { return true; } + + protected: + pten::Allocation *AllocateImpl(size_t size) override; + + void FreeImpl(pten::Allocation *allocation) override; + + private: + pten::Allocation *AllocFromFreeBlocks(size_t size); + void ExtendAndMerge(size_t size); + void TryMergeBlock2Blocks(std::list::iterator iter); + + std::shared_ptr underlying_allocator_; + size_t alignment_; + + std::map, std::list::iterator> free_blocks_; + std::list all_blocks_; + std::list allocations_; + platform::Place place_; + SpinLock spinlock_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3b8d07548e..8830c46a17 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -18,6 +18,7 @@ limitations under the License.
*/ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { @@ -27,7 +28,7 @@ class DeviceContext; namespace memory { -using allocation::Allocation; +using pten::Allocation; using allocation::Allocator; using allocation::AllocationPtr; @@ -40,5 +41,24 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); extern uint64_t Release(const platform::Place& place); +extern std::shared_ptr AllocShared(const platform::Place& place, + size_t size, + const platform::Stream& stream); + +extern bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const gpuStream_t& stream); + +extern uint64_t Release(const platform::CUDAPlace& place, + const gpuStream_t& stream); + +void RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream); + +const gpuStream_t& GetStream(const std::shared_ptr& allocation); +#endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index b9481f1c8e..6892f7ce4e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -286,10 +287,11 @@ class ConcatFunctor { const T** dev_ins_data = nullptr; if (!has_same_shape || in_num < 2 || in_num > 4) { tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), - static_cast(inputs_data), in_num * sizeof(T*), - context.stream()); + tmp_dev_ins_data->ptr(), platform::CPUPlace(), restored, + in_num * sizeof(T*), context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -313,9 +315,11 @@ class ConcatFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - static_cast(inputs_col), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, inputs_col_num * sizeof(int64_t), context.stream()); int64_t* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); @@ -332,9 +336,8 @@ class ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(col_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(col_alloc_released); }); #endif } @@ 
-415,10 +418,11 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_data), o_num * sizeof(T*), - context.stream()); + tmp_dev_outs_data->ptr(), platform::CPUPlace(), restored, + o_num * sizeof(T*), context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -442,9 +446,10 @@ class SplitFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_cols), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); @@ -460,9 +465,8 @@ class SplitFunctor { auto* data_alloc_released = data_alloc.release(); auto* cols_alloc_released = cols_alloc.release(); context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(cols_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(cols_alloc_released); }); #endif } diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index e69de29bb2..408016c0f0 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -0,0 +1,162 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace platform { +class MLUDeviceContext; +} // namespace platform + +namespace memory { +namespace allocation { + +/** + * MLUDeviceContextAllocation is a wrapper of the underbeneath allocation. + * MLUDeviceContextAllocation adds a MLU stream callback for the underbeneath + * allocation so that MLUDeviceContextAllocation can be used in a MLU stream + * which deletes allocation in the callback. 
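Both the concat/split functors above and the MLU allocation described in this comment rely on the same keep-alive idiom: release() the smart pointer so the allocation outlives the current scope, then free it from a stream callback once the asynchronous work that reads it has drained. A condensed sketch of that idiom follows (illustrative helper name, CUDA context assumed, includes as in concat_and_split.cu); MLUDeviceContextAllocation's destructor does the equivalent via dev_ctx_->AddStreamCallback.

// Illustrative fragment (not part of the patch): keep a staging allocation
// alive until the device stream has drained the async work that reads it.
void CopyThroughStaging(const paddle::platform::CUDADeviceContext& ctx,
                        size_t bytes) {
  auto staging = paddle::memory::Alloc(ctx, bytes);  // unique AllocationPtr
  // ... enqueue asynchronous copies on ctx.stream() that read staging->ptr() ...
  auto* released = staging.release();  // give up scoped ownership
  ctx.AddStreamCallback([released] {
    // Runs only after the work queued on ctx.stream() so far has completed.
    paddle::memory::allocation::Allocator::AllocationDeleter(released);
  });
}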
+ */ +class MLUDeviceContextAllocation : public Allocation { + public: + explicit MLUDeviceContextAllocation(AllocationPtr allocation) + : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + underlying_allocation_(std::move(allocation)) {} + + ~MLUDeviceContextAllocation() { + PADDLE_ENFORCE_NOT_NULL( + dev_ctx_, + platform::errors::PreconditionNotMet( + "Device context is not set for MLUDeviceContextAllocation")); + auto *p_allocation = underlying_allocation_.release(); + VLOG(4) << "Adding callback to delete MLUDeviceContextAllocation at " + << p_allocation; + dev_ctx_->AddStreamCallback([p_allocation] { + VLOG(4) << "Delete MLUDeviceContextAllocation at " << p_allocation; + Allocator::AllocationDeleter(p_allocation); + }); + } + + void SetMLUDeviceContext(const platform::MLUDeviceContext *dev_ctx) { + dev_ctx_ = dev_ctx; + } + + private: + AllocationPtr underlying_allocation_; + const platform::MLUDeviceContext *dev_ctx_{nullptr}; +}; + +/** + * MLUDeviceContextAllocator will allocate a MLUDeviceContextAllocation + * after waiting for a self-created event on the default stream. It does so to + * let the non-default stream be able to allocate GPU memory which will be + * released by stream callback + */ +class MLUDeviceContextAllocator : public Allocator { + public: + explicit MLUDeviceContextAllocator(platform::MLUPlace place, + mluStream default_stream) + : place_(place), default_stream_(default_stream) { + platform::MLUDeviceGuard guard(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtNotifierCreate(&event_)); + } + + ~MLUDeviceContextAllocator() { + if (event_) { + platform::MLUDeviceGuard guard(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtNotifierDestroy(event_)); + } + } + + protected: + pten::Allocation *AllocateImpl(size_t size) override { + PADDLE_ENFORCE_NOT_NULL( + default_stream_, + platform::errors::PreconditionNotMet( + "Default stream is not set for MLUDeviceContextAllocator")); + platform::MLUDeviceGuard guard(place_.device); + auto allocation = + new MLUDeviceContextAllocation(memory::Alloc(place_, size)); + // Wait for the event on stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event_, default_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtWaitNotifier(event_)); + return allocation; + } + + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } + + private: + platform::MLUPlace place_; + mluEventHandle event_{nullptr}; + mluStream default_stream_{nullptr}; +}; + +/** + * MLUDeviceContextAllocatorPool is a singletion stores mapping from + * MLUPlace(s) to std::shared_ptr. When a + * MLUDeviceContext's compute stream isn't default stream, it can call this + * class to allocate GPU memory which will be released by a callback after + * stream execution. 
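A hypothetical call site for the pool described above might look like the following; the function name and byte count are made up, and only the Instance()/Alloc(dev_ctx, size) interface defined below is used.

// Illustrative only: ask the singleton pool for memory tied to an MLU
// context whose compute stream is not the default stream.
void AllocateScratch(const paddle::platform::MLUDeviceContext& dev_ctx) {
  auto scratch =
      paddle::memory::allocation::MLUDeviceContextAllocatorPool::Instance()
          .Alloc(dev_ctx, 4096);
  void* workspace = scratch->ptr();
  (void)workspace;  // silence unused-variable warnings in this sketch
  // ... launch work on dev_ctx's stream that uses `workspace` ...
  // When `scratch` goes out of scope, MLUDeviceContextAllocation's destructor
  // registers a stream callback that frees the underlying memory later.
}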
+ */ +class MLUDeviceContextAllocatorPool { + public: + static MLUDeviceContextAllocatorPool &Instance() { + static MLUDeviceContextAllocatorPool pool; + return pool; + } + + AllocationPtr Alloc(const platform::MLUDeviceContext &dev_ctx, size_t size) { + auto iter = allocators_.find( + BOOST_GET_CONST(platform::MLUPlace, dev_ctx.GetPlace())); + PADDLE_ENFORCE_NE( + iter, allocators_.end(), + platform::errors::NotFound("No allocator found for MLUPlace.")); + auto &allocator = iter->second; + AllocationPtr allocation = allocator->Allocate(size); + static_cast(allocation.get()) + ->SetMLUDeviceContext(&dev_ctx); + return allocation; + } + + private: + MLUDeviceContextAllocatorPool() { + std::vector devices = platform::GetMLUSelectedDevices(); + for (int i : devices) { + auto place = platform::MLUPlace(i); + auto compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + auto allocator = std::shared_ptr( + new MLUDeviceContextAllocator(place, compute_stream)); + allocators_.insert(make_pair(place, allocator)); + } + } + + std::map> + allocators_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index e83057e682..c049da3b33 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -158,8 +158,7 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation *allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation *allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); } else { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index e69de29bb2..659df6b9b4 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +extern PyTypeObject* p_eager_tensor_type; + +size_t PyArray_Size_(PyObject* numpy_data) { + size_t res = 1; + auto dims = pybind11::detail::array_proxy(numpy_data)->dimensions; + auto nd = pybind11::detail::array_proxy(numpy_data)->nd; + while (nd--) { + res *= (*dims++); + } + return res; +} + +class EagerNumpyAllocation : public pten::Allocation { + public: + explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) + : Allocation( + static_cast(pybind11::detail::array_proxy(numpy_data)->data), + pten::DataTypeSize(dtype) * PyArray_Size_(numpy_data), + paddle::platform::CPUPlace()), + arr_(numpy_data) { + PADDLE_ENFORCE_NOT_NULL(arr_, platform::errors::InvalidArgument( + "The underlying PyObject pointer of " + "numpy array cannot be nullptr")); + PADDLE_ENFORCE_NE( + arr_, Py_None, + platform::errors::PreconditionNotMet( + "The underlying PyObject pointer of numpy array cannot be None")); + Py_INCREF(arr_); + } + ~EagerNumpyAllocation() override { + py::gil_scoped_acquire gil; + Py_DECREF(arr_); + } + + private: + PyObject* arr_; +}; + +static PyObject* eager_api_set_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + egr::Controller::Instance().SetExpectedPlace(place); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_get_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(egr::Controller::Instance().GetExpectedPlace()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_scale(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Sync Tensor and Variable here when we support + egr::EagerTensor ret = + egr::scale(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor, + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1), + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4)); + return ToPyObject(ret); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensors = + CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 1), 1); + RunBackward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + 
Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::EagerTensor& src = + reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor; + egr::EagerTensor& dst = + reinterpret_cast(PyTuple_GET_ITEM(args, 1)) + ->eager_tensor; + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); + bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + + dst = src.copy_to(pten::TransToPtenBackend(place), blocking); + egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( + egr::EagerUtils::autograd_meta(&(src))->StopGradient()); + egr::EagerUtils::autograd_meta(&dst)->SetPersistable( + egr::EagerUtils::autograd_meta(&(src))->Persistable()); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_functions[] = { + {"scale", (PyCFunction)(void (*)(void))eager_api_scale, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_expected_place", + (PyCFunction)(void (*)(void))eager_api_set_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_get_expected_place", + (PyCFunction)(void (*)(void))eager_api_get_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +void BindFunctions(PyObject* module) { + if (PyModule_AddFunctions(module, variable_functions) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindFunctions(PyModule_AddFunctions).")); + return; + } +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index 4a44ad7758..a4db8c4b19 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS +cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h new file mode 100644 index 0000000000..a8c05b7651 --- /dev/null +++ b/paddle/pten/api/lib/utils/allocator.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
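EagerNumpyAllocation above shows the general recipe for handing externally owned memory to pten: subclass pten::Allocation, forward (ptr, size, place) to the base, and drop the external reference in the destructor. A stripped-down variant of that recipe is sketched here; the class name and the on_release callback are invented (they stand in for the numpy array pointer and Py_DECREF), and the include mirrors what eager_functions.cc already pulls in.

#include <cstddef>
#include <functional>
#include <utility>

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

// Illustrative only: wrap a caller-owned CPU buffer in a pten::Allocation so
// that pten tensors can reference it without taking ownership of the memory.
class ExternalBufferAllocation : public pten::Allocation {
 public:
  ExternalBufferAllocation(void* data, size_t size,
                           std::function<void()> on_release)
      : Allocation(data, size, paddle::platform::CPUPlace()),
        on_release_(std::move(on_release)) {}

  ~ExternalBufferAllocation() override {
    // Tell the external owner we are done with the buffer.
    if (on_release_) on_release_();
  }

 private:
  std::function<void()> on_release_;
};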
*/ + +#pragma once + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/pten/core/allocator.h" +#include "paddle/pten/core/storage.h" + +namespace paddle { +namespace experimental { + +class DefaultAllocator : public pten::deprecated::Allocator { + public: + using Allocation = pten::deprecated::Allocation; + explicit DefaultAllocator(const paddle::platform::Place& place) + : place_(place) {} + + static void Delete(Allocation* allocation) { + paddle::memory::allocation::Allocator::AllocationDeleter( + allocation->CastContextWithoutCheck()); + } + + Allocation Allocate(size_t bytes_size) override { + paddle::memory::AllocationPtr a = memory::Alloc(place_, bytes_size); + void* ptr = a->ptr(); + return Allocation(ptr, a.release(), &Delete, place_); + } + + const paddle::platform::Place& place() override { return place_; } + + private: + paddle::platform::Place place_; +}; + +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc new file mode 100644 index 0000000000..6116a709d5 --- /dev/null +++ b/paddle/pten/api/lib/utils/storage.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/api/lib/utils/storage.h" + +namespace paddle { +namespace experimental { + +ExternalStorage::ExternalStorage(void* ptr, + size_t size, + const paddle::platform::Place& place) + : pten::Storage(std::make_shared(ptr, size, place)), + size_(size) {} + +ExternalStorage::ExternalStorage(const pten::intrusive_ptr& root, + size_t delta, + size_t size) + : Storage(std::make_shared( + static_cast(root->data()) + delta, size, root->place())), + size_(size) { + PADDLE_ENFORCE_LE(static_cast(delta + size), + root->size(), + paddle::platform::errors::InvalidArgument( + "The size of the external storage does " + "not meet the metadata requirements.")); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc new file mode 100644 index 0000000000..0b6cb8d95c --- /dev/null +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -0,0 +1,521 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
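For the deprecated allocator path above, the Allocation returned by DefaultAllocator is a value type that carries its own deleter, so clean-up happens when it leaves scope. A minimal, hypothetical call site (function name and sizes are illustrative):

#include "paddle/pten/api/lib/utils/allocator.h"

// Illustrative only: allocate through DefaultAllocator and let the value-type
// pten::deprecated::Allocation free the memory via its registered deleter.
void UseDefaultAllocator() {
  paddle::platform::CPUPlace cpu;
  paddle::experimental::DefaultAllocator allocator(cpu);
  auto buffer = allocator.Allocate(64 * sizeof(float));
  auto* data = static_cast<float*>(buffer.ptr());
  data[0] = 1.0f;
}  // ~Allocation() -> Clear() -> DefaultAllocator::Delete on the fluid allocation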
*/ + +#include "paddle/pten/api/lib/utils/tensor_utils.h" + +#include +#include + +#include "paddle/pten/core/compat_utils.h" + +namespace paddle { +namespace experimental { + +template +void SetLoD(DstLoD* dst, const SrcLoD& src) { + dst->reserve(src.size()); + dst->clear(); + for (auto&& v : src) { + dst->emplace_back(v); + } +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::Tensor& src) { + VLOG(3) << "MakePtenDenseTensor based Tensor."; + pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()), + src.dims(), + src.layout(), + src.offset()}; + auto shared_storage = pten::make_intrusive(src.Holder()); + return std::make_unique(std::move(shared_storage), + std::move(meta)); +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::LoDTensor& src) { + auto out = + MakePtenDenseTensor(static_cast(src)); + SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), + src.lod()); + return std::move(out); +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { + pten::DenseTensorMeta meta{ + arg_def.dtype, src.dims(), src.layout(), src.offset()}; + + if (src.IsInitialized() && + src.place() == pten::TransToFluidPlace(arg_def.backend)) { + auto shared_storage = pten::make_intrusive(src.Holder()); + return std::make_unique(std::move(shared_storage), + std::move(meta)); + } else { + return std::make_unique( + std::move(pten::make_intrusive( + pten::TransToFluidPlace(arg_def.backend))), + std::move(meta)); + } +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::LoDTensor& src, + const pten::TensorArgDef& arg_def) { + auto out = MakePtenDenseTensor( + static_cast(src), arg_def); + SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), + src.lod()); + return std::move(out); +} + +pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src) { + PADDLE_ENFORCE_EQ(src.numel(), + 1, + paddle::platform::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, " + "but now Tensor has %d element.", + src.numel())); + switch (src.type()) { + case paddle::framework::proto::VarType::FP32: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::FP64: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::FP16: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::BF16: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT32: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT64: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT16: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT8: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::UINT8: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::BOOL: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::COMPLEX64: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::COMPLEX128: + return {src.template data()[0]}; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Data type error. 
Don't support casting a %d LoDTensor to Scalar.", + src.type())); + } +} + +pten::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { + auto expected_place = pten::TransToFluidPlace(pten::Backend::CPU); + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + return MakePtenScalar(tmp_tensor); + } else { + return MakePtenScalar(tensor); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to Scalar when call pt " + "kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +pten::ScalarArray MakePtenScalarArray(const paddle::framework::LoDTensor& src) { + if (src.type() == paddle::framework::proto::VarType::INT64) { + return {src.data(), src.numel()}; + } else if (src.type() == paddle::framework::proto::VarType::INT32) { + return {src.data(), src.numel()}; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Data type error. When cast a LoDTensor to ScalarArray, " + "the data type of LoDTensor must be int32 or int64, " + "but now data type is %s.", + src.type())); + } +} + +pten::ScalarArray MakePtenScalarArrayFromVar( + const framework::Variable& variable) { + auto expected_place = pten::TransToFluidPlace(pten::Backend::CPU); + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + return MakePtenScalarArray(tmp_tensor); + } else { + return MakePtenScalarArray(tensor); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to ScalarArray when call pt " + "kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +pten::ScalarArray MakePtenScalarArrayFromVarList( + const std::vector& variable_list) { + if (variable_list.size() == 0) { + return pten::ScalarArray(); + } + auto expected_place = pten::TransToFluidPlace(pten::Backend::CPU); + + paddle::framework::proto::VarType::Type data_type; + auto* first_var = variable_list.front(); + if (first_var->IsType()) { + const auto& tensor = first_var->Get(); + data_type = tensor.type(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to VectorTensor when call pt " + "kernel.", + framework::ToTypeName(first_var->Type()))); + } + + std::vector vector_data; + vector_data.reserve(variable_list.size()); + + if (data_type == paddle::framework::proto::VarType::INT64) { + for (auto* var : variable_list) { + if (var->IsType()) { + const auto& tensor = var->Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + vector_data.push_back(*tmp_tensor.data()); + } else { + vector_data.push_back(*tensor.data()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to VectorTensor when call pt " + "kernel.", + framework::ToTypeName(var->Type()))); + } + } + + } else if (data_type == paddle::framework::proto::VarType::INT32) { + for (auto* var : variable_list) { + if (var->IsType()) { + const auto& tensor = var->Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, 
&tmp_tensor); + vector_data.push_back(*tmp_tensor.data()); + } else { + vector_data.push_back(*tensor.data()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to VectorTensor when call pt " + "kernel.", + framework::ToTypeName(var->Type()))); + } + } + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Data type error. When cast a LoDTensor to VectorTensor, " + "the data type of LoDTensor must be int32 or int64, " + "but now data type is %s.", + data_type)); + } + + return {vector_data}; +} + +std::unique_ptr MakePtenTensorBaseFromVar( + const framework::Variable& variable, const pten::TensorArgDef& arg_def) { + auto expected_place = pten::TransToFluidPlace(arg_def.backend); + + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + return MakePtenDenseTensor(tmp_tensor); + } else { + return MakePtenDenseTensor(tensor); + } + } else if (variable.IsType()) { + // TODO(chenweihang): now we don't deal with row and height + // by xiaowei's advice + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.value().place(), expected_place)) { + framework::Tensor tmp_tensor; + TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design + return MakePtenDenseTensor(tmp_tensor); + } else { + return MakePtenDenseTensor(tensor.value()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared input `%s` type now when call pt kernel.", + framework::ToTypeName(variable.Type()))); + } + return {}; +} + +std::unique_ptr MakePtenTensorBaseFromVar( + framework::Variable* variable, const pten::TensorArgDef& arg_def) { + // mutable_data before run kernel, to avoid share output form + // KernelContext to original tensor + if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + return MakePtenDenseTensor(*tensor, arg_def); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design, + // here the row and height will lost in output! 
+ return MakePtenDenseTensor(tensor->value(), arg_def); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared output `%s` type now when call pt kernel.", + framework::ToTypeName(variable->Type()))); + } + return {}; +} + +void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + PADDLE_ENFORCE_NOT_NULL( + src, + platform::errors::InvalidArgument( + "The source DenseTensor is nullptr when move storage.")); + PADDLE_ENFORCE_NOT_NULL( + dst, + platform::errors::InvalidArgument( + "The destination Tensor is nullptr when move storage.")); + dst->Resize(src->dims()); + dst->set_type(pten::TransToProtoVarType(src->dtype())); + auto storage = src->release(); + std::shared_ptr holder( + new TensorStorage(std::move(storage))); + dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); + dst->set_offset(src->meta().offset); +} + +void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { + MovesStorage(src, static_cast(dst)); + SetLoD(dst->mutable_lod(), src->lod()); +} + +void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + PADDLE_ENFORCE_NOT_NULL( + src, + platform::errors::InvalidArgument( + "The source DenseTensor is nullptr when move allocation.")); + PADDLE_ENFORCE_NOT_NULL( + dst, + platform::errors::InvalidArgument( + "The destination Tensor is nullptr when move allocation.")); + dst->Resize(src->dims()); + auto* storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); + dst->ResetHolderWithType(storage->GetAllocation(), + pten::TransToProtoVarType(src->dtype())); + dst->set_offset(src->meta().offset); +} + +void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { + SharesStorage(src, static_cast(dst)); + SetLoD(dst->mutable_lod(), src->lod()); +} + +void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, + pten::DenseTensor* dst) { + VLOG(3) << "ReMakePtenDenseTensor based Tensor."; + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + meta->dims = src.dims(); + meta->dtype = pten::TransToPtenDataType(src.type()); + meta->layout = src.layout(); + meta->offset = src.offset(); + + auto* shared_storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); + PADDLE_ENFORCE_NOT_NULL( + shared_storage, + platform::errors::NotFound( + "Target DenseTensor's shared storage is nullptr.")); + + PADDLE_ENFORCE_EQ(src.IsInitialized(), + true, + paddle::platform::errors::InvalidArgument( + "Source Tensor is not initialized.")); + shared_storage->ResetAllocation(src.Holder()); +} + +void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + SetLoD(&meta->lod, src.lod()); + ReMakePtenDenseTensor(static_cast(src), + dst); +} + +void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef."; + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + meta->dims = src.dims(); + meta->dtype = arg_def.dtype; + meta->layout = src.layout(); + meta->offset = src.offset(); + + auto* shared_storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); + PADDLE_ENFORCE_NOT_NULL( + shared_storage, + platform::errors::NotFound( + "Target DenseTensor's shared storage is nullptr.")); + + if (src.IsInitialized() && + 
src.place() == pten::TransToFluidPlace(arg_def.backend)) { + shared_storage->ResetAllocation(src.Holder()); + } else { + shared_storage->ResetAllocationPlace( + pten::TransToFluidPlace(arg_def.backend)); + } +} + +void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + SetLoD(&meta->lod, src.lod()); + ReMakePtenDenseTensorByArgDef( + static_cast(src), arg_def, dst); +} + +void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + auto expected_place = pten::TransToFluidPlace(arg_def.backend); + if (variable.IsType()) { + const auto& tensor = variable.Get(); + // check input dtype before ReMakePtenDenseTensor + PADDLE_ENFORCE( + (arg_def.dtype == pten::TransToPtenDataType(tensor.type())), + paddle::platform::errors::InvalidArgument( + "The type of input data is diffrent from the type of the " + "argument's definition in kernel.")); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); + } else { + ReMakePtenDenseTensorByArgDef(tensor, arg_def, dst); + } + } else if (variable.IsType()) { + // TODO(chenweihang): now we don't deal with row and height + // by xiaowei's advice + const auto& tensor = variable.Get(); + PADDLE_ENFORCE( + (arg_def.dtype == pten::TransToPtenDataType(tensor.value().type())), + paddle::platform::errors::InvalidArgument( + "The type of input data is diffrent from the type of the " + "argument's definition in kernel.")); + if (!platform::is_same_place(tensor.value().place(), expected_place)) { + framework::Tensor tmp_tensor; + TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design + ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); + } else { + ReMakePtenDenseTensorByArgDef(tensor.value(), arg_def, dst); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared input `%s` type now when call pt kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +void ReMakePtenDenseTensorFromVar(framework::Variable* variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + // mutable_data before run kernel, to avoid share output form + // KernelContext to original tensor + if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + ReMakePtenDenseTensorByArgDef(*tensor, arg_def, dst); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design, + // here the row and height will lost in output! 
+    ReMakePtenDenseTensorByArgDef(tensor->value(), arg_def, dst);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type when calling the pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+}
+
+static bool IsSameAllocation(const std::shared_ptr<pten::Allocation>& a,
+                             const std::shared_ptr<pten::Allocation>& b) {
+  return a->ptr() == b->ptr() && a->size() == b->size() &&
+         platform::is_same_place(a->place(), b->place());
+}
+
+void MakeVariableFromPtenTensor(pten::DenseTensor* src,
+                                framework::Variable* variable) {
+  if (variable->IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->GetMutable<framework::LoDTensor>();
+
+    auto dtype = pten::TransToProtoVarType(src->dtype());
+    tensor->Resize(src->dims());
+    SetLoD(tensor->mutable_lod(), src->lod());
+
+    // here dynamic_cast is slow
+    auto* storage = static_cast<SharedStorage*>(
+        pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src));
+
+    if (!tensor->IsInitialized() ||
+        (tensor->IsInitialized() &&
+         !IsSameAllocation(tensor->Holder(), storage->GetAllocation()))) {
+      tensor->ResetHolderWithType(std::move(storage->GetAllocation()), dtype);
+    } else {
+      // Even if the pten tensor and the Variable have the same Allocation
+      // (the same pointer address, size and place), it is possible that
+      // they do not have the same data_type.
+      // So, here we set the variable's type with the pten tensor's dtype.
+      tensor->set_type(dtype);
+    }
+
+  } else if (variable->IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->GetMutable<framework::SelectedRows>();
+    auto dtype = pten::TransToProtoVarType(src->dtype());
+
+    if (!tensor->value().IsInitialized()) {
+      auto storage = dynamic_cast<SharedStorage*>(
+          pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src));
+      tensor->mutable_value()->ResetHolderWithType(
+          std::move(storage->GetAllocation()), dtype);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type when calling the pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h
index e69de29bb2..2647490c9f 100644
--- a/paddle/pten/core/allocator.h
+++ b/paddle/pten/core/allocator.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/core/candidate/allocator.h"
+
+namespace pten {
+namespace deprecated {
+
+/// \brief Encapsulates strategies for access/addressing, allocation/
+/// deallocation and construction/destruction of objects.
+class RawAllocator {
+ public:
+  using Place = paddle::platform::Place;
+
+  /// \brief Default destructor.
+  virtual ~RawAllocator() = default;
+
+  /// \brief Allocates storage suitable for an array object of n bytes
+  /// and creates the array, but does not construct array elements.
+  /// May throw exceptions.
+  /// \param bytes_size The number of bytes to allocate.
+  /// \return The first address allocated.
+  virtual void* Allocate(size_t bytes_size) = 0;
+
+  /// \brief Deallocates the storage pointed to by ptr, which must be a value
+  /// returned by a previous call to allocate that has not been
+  /// invalidated by an intervening call to deallocate. The bytes_size
+  /// must match the value previously passed to allocate.
+  /// \param ptr The first address to deallocate.
+  /// \param bytes_size The number of bytes to deallocate.
+  virtual void Deallocate(void* ptr, size_t bytes_size) = 0;
+
+  /// \brief Get the place value of the allocator and the allocation.
+  /// \return The place value of the allocator and the allocation.
+  virtual const Place& place() const = 0;
+};
+
+/// \brief Fancy pointer with context. The use of this data type
+/// is to be compatible with allocators from different frameworks
+/// without significant performance loss. This class does not
+/// support being inherited.
+class Allocation final {
+ public:
+  using Place = paddle::platform::Place;
+  using DeleterFnPtr = void (*)(Allocation*);
+
+  Allocation() = default;
+
+  // Don't own resources, only provide access.
+  Allocation(void* data, const Place& place) : data_(data), place_(place) {}
+
+  // Own resources.
+  Allocation(void* data, void* ctx, DeleterFnPtr deleter, const Place& place)
+      : data_(data), ctx_(ctx), deleter_(deleter), place_(place) {}
+
+  Allocation(Allocation&& other) { swap(*this, other); }
+  Allocation& operator=(Allocation&& other) {
+    // Exchange them explicitly so that a move does not
+    // degenerate into a copy.
+    swap(*this, other);
+    return *this;
+  }
+  ~Allocation() { Clear(); }
+
+  void* ptr() const noexcept { return data_; }
+  void* operator->() const noexcept { return data_; }
+  operator bool() const noexcept { return data_ || ctx_; }
+  const Place& place() const noexcept { return place_; }
+
+  void Clear() {
+    if (deleter_) {
+      deleter_(this);
+    }
+    ctx_ = nullptr;
+    deleter_ = nullptr;
+    data_ = nullptr;
+  }
+
+  DeleterFnPtr deleter() const noexcept { return deleter_; }
+
+  template <typename T>
+  T* CastContextWithoutCheck() const noexcept {
+    return static_cast<T*>(ctx_);
+  }
+
+  /// \brief Statically cast the void pointer of the context object to
+  /// the primitive type. Conversion of any pointer to void* and back
+  /// to pointer to the original cv type preserves its original value.
+  /// \param T The primitive type name of the context pointer.
+  /// \param expected_deleter The destructor passed in to enhance type
+  /// safety checking.
+  template <typename T>
+  T* CastContext(DeleterFnPtr expected_deleter) const {
+    PADDLE_ENFORCE_EQ(
+        deleter_ == expected_deleter,
+        true,
+        paddle::platform::errors::InvalidArgument(
+            "The deleter of the allocation does not match, so the pointer "
+            "cannot be safely removed."));
+    return CastContextWithoutCheck<T>();
+  }
+
+ private:
+  friend void swap(Allocation& a, Allocation& b) noexcept;
+  void* data_{nullptr};
+  void* ctx_{nullptr};
+  DeleterFnPtr deleter_{nullptr};
+  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
+  // the construction overhead by more than 50%.
+  Place place_;
+};
+
+inline void swap(Allocation& a, Allocation& b) noexcept {
+  ::std::swap(a.data_, b.data_);
+  ::std::swap(a.ctx_, b.ctx_);
+  ::std::swap(a.deleter_, b.deleter_);
+  ::std::swap(a.place_, b.place_);
+}
+
+/// \brief Context compatible allocator interface. This allocator is
+/// mainly used for general data structures such as Tensor. The raw
+/// allocator is more universal and efficient.
+class Allocator {
+  using Place = paddle::platform::Place;
+
+ public:
+  virtual ~Allocator() = default;
+  virtual Allocation Allocate(size_t bytes_size) = 0;
+  virtual const Place& place() = 0;
+};
+
+inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
+  CHECK(a);
+  return a->Allocate(n);
+}
+
+}  // namespace deprecated
+}  // namespace pten
diff --git a/paddle/pten/core/candidate/allocator.h b/paddle/pten/core/candidate/allocator.h
new file mode 100644
index 0000000000..75d42c4fd1
--- /dev/null
+++ b/paddle/pten/core/candidate/allocator.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include "paddle/fluid/platform/place.h"
+
+namespace pten {
+
+/// \brief Fancy pointer with deleter. The use of this data type
+/// is to be compatible with allocators from different frameworks
+/// without significant performance loss. This class does not
+/// support being inherited.
+class Allocation {
+ public:
+  using Place = paddle::platform::Place;
+  using DeleterFnPtr = void (*)(Allocation*);
+
+  Allocation() = default;
+
+  // Don't own resources, only provide access.
+  Allocation(void* data, size_t size, const Place& place)
+      : ptr_(data), size_(size), place_(place) {}
+
+  // Own resources.
+  Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place)
+      : ptr_(data), size_(size), deleter_(deleter), place_(place) {}
+
+  Allocation(Allocation&& other) noexcept { swap(*this, other); }
+  Allocation& operator=(Allocation&& other) noexcept {
+    // Exchange them explicitly so that a move does not
+    // degenerate into a copy.
+    swap(*this, other);
+    return *this;
+  }
+
+  virtual ~Allocation() {
+    if (deleter_) {
+      deleter_(this);
+    }
+  }
+
+  // Returns the holding pointer.
+  // NOTE: For performance considerations, it is better not to make this
+  // method virtual. If we want to implement a `defragmentation` later,
+  // we might need to make the `ptr_` field protected, and add a virtual
+  // method like `defragmentation` to change `ptr_`.
+  void* ptr() const noexcept { return ptr_; }
+
+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators may allocate more memory than requested. The size
+  // can be larger than the requested size. For example,
+  // the AlignedAllocator will always allocate memory as size + kAlignment.
+  // The raw pointer might not be aligned, so an offset might be added to
+  // the raw pointer. The size of this allocation will be
+  // `size + kAlignment - offset`.
+  size_t size() const noexcept { return size_; }
+
+  void* operator->() const noexcept { return ptr_; }
+  operator bool() const noexcept { return ptr_; }
+  const Place& place() const noexcept { return place_; }
+  DeleterFnPtr deleter() const noexcept { return deleter_; }
+
+ protected:
+  friend void swap(Allocation& a, Allocation& b) noexcept;
+  void* ptr_{nullptr};
+  size_t size_{};
+  DeleterFnPtr deleter_{nullptr};
+  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
+  // the construction overhead by more than 50%.
+  Place place_;
+};
+
+inline void swap(Allocation& a, Allocation& b) noexcept {
+  ::std::swap(a.ptr_, b.ptr_);
+  ::std::swap(a.deleter_, b.deleter_);
+  ::std::swap(a.place_, b.place_);
+  ::std::swap(a.size_, b.size_);
+}
+
+class Allocator {
+ public:
+  using DeleterType = std::function<void(Allocation*)>;
+  using AllocationPtr = std::unique_ptr<Allocation, DeleterType>;
+
+  virtual ~Allocator() = default;
+  virtual AllocationPtr Allocate(size_t bytes_size) = 0;
+
+  virtual bool IsAllocThreadSafe() const { return false; }
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 1502accd19..1802a24611 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -60,6 +60,8 @@ class TensorInplaceVersion {
 class DenseTensor : public TensorBase,
                     public TypeInfoTraits<TensorBase, DenseTensor> {
  public:
+  using Allocator = deprecated::Allocator;
+
   /// \brief Construct a dense tensor and allocate space.
   /// \param a The allocator used to allocate space.
   /// \param meta The meta data of dense tensor.
diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h
index fc56935eea..cf18dd9130 100644
--- a/paddle/pten/core/storage.h
+++ b/paddle/pten/core/storage.h
@@ -91,6 +91,7 @@ class Storage : public intrusive_ref_counter<Storage> {
 class TensorStorage : public Storage {
  public:
   using Place = paddle::platform::Place;
+  using Allocator = deprecated::Allocator;
 
   explicit TensorStorage(const std::shared_ptr<Allocator>& a) : alloc_(a) {}
diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h
index e69de29bb2..c2c74e1aac 100644
--- a/paddle/pten/tests/core/allocator.h
+++ b/paddle/pten/tests/core/allocator.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+
+#include "paddle/pten/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+class HostAllocatorSample : public pten::deprecated::RawAllocator {
+ public:
+  using Place = paddle::platform::Place;
+  void* Allocate(size_t bytes_size) override {
+    return ::operator new(bytes_size);
+  }
+  void Deallocate(void* ptr, size_t bytes_size) override {
+    return ::operator delete(ptr);
+  }
+  const Place& place() const override { return place_; }
+
+ private:
+  Place place_{paddle::platform::CPUPlace()};
+};
+
+class FancyAllocator : public pten::deprecated::Allocator {
+ public:
+  using Allocation = pten::deprecated::Allocation;
+  static void Delete(Allocation* allocation) {
+    ::operator delete(allocation->ptr());
+  }
+
+  Allocation Allocate(size_t bytes_size) override {
+    void* data = ::operator new(bytes_size);
+    return Allocation(data, data, &Delete, place());
+  }
+
+  const paddle::platform::Place& place() override { return place_; }
+
+  paddle::platform::Place place_ = paddle::platform::CPUPlace();
+};
+
+template <typename T>
+struct CustomAllocator {
+  using value_type = T;
+  using Allocator = pten::deprecated::RawAllocator;
+
+  explicit CustomAllocator(const std::shared_ptr<Allocator>& a) noexcept
+      : alloc_(a) {}
+
+  CustomAllocator(const CustomAllocator&) noexcept = default;
+  T* allocate(std::size_t n) {
+    return static_cast<T*>(alloc_->Allocate(n * sizeof(T)));
+  }
+  void deallocate(T* p, std::size_t n) {
+    return alloc_->Deallocate(p, sizeof(T) * n);
+  }
+
+  template <typename U>
+  friend bool operator==(const CustomAllocator<U>&,
+                         const CustomAllocator<U>&) noexcept;
+  template <typename U>
+  friend bool operator!=(const CustomAllocator<U>&,
+                         const CustomAllocator<U>&) noexcept;
+
+ private:
+  std::shared_ptr<Allocator> alloc_;
+};
+
+template <typename T>
+inline bool operator==(const CustomAllocator<T>& lhs,
+                       const CustomAllocator<T>& rhs) noexcept {
+  return &lhs.alloc_ == &rhs.alloc_;
+}
+
+template <typename T>
+inline bool operator!=(const CustomAllocator<T>& lhs,
+                       const CustomAllocator<T>& rhs) noexcept {
+  return &lhs.alloc_ != &rhs.alloc_;
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc
index e69de29bb2..94ba9a1e1b 100644
--- a/paddle/pten/tests/core/test_allocator.cc
+++ b/paddle/pten/tests/core/test_allocator.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/pten/tests/core/allocator.h"
+#include "paddle/pten/tests/core/random.h"
+#include "paddle/pten/tests/core/timer.h"
+
+namespace pten {
+namespace tests {
+
+using RawAllocator = pten::deprecated::RawAllocator;
+using Allocator = pten::deprecated::Allocator;
+using Allocation = pten::deprecated::Allocation;
+
+template <typename T>
+bool host_allocator_test(size_t vector_size) {
+  std::vector<T> src(vector_size);
+  std::generate(src.begin(), src.end(), make_generator(src));
+  std::vector<T, CustomAllocator<T>> dst(
+      src.begin(),
+      src.end(),
+      CustomAllocator<T>(std::make_shared<HostAllocatorSample>()));
+  return std::equal(src.begin(), src.end(), dst.begin());
+}
+
+TEST(raw_allocator, host) {
+  CHECK(host_allocator_test(1000));
+  CHECK(host_allocator_test(1000));
+  CHECK(host_allocator_test(1000));
+}
+
+class StorageRawAlloc {
+ public:
+  StorageRawAlloc(const std::shared_ptr<RawAllocator>& a, size_t size)
+      : alloc_(a), size_(size) {
+    data_ = alloc_->Allocate(size);
+  }
+  ~StorageRawAlloc() { alloc_->Deallocate(data_, size_); }
+
+ private:
+  void* data_;
+  size_t size_;
+  std::shared_ptr<RawAllocator> alloc_;
+};
+
+class StorageFancyAlloc {
+ public:
+  StorageFancyAlloc(const std::shared_ptr<Allocator>& a, size_t size)
+      : alloc_(a), allocation_(a->Allocate(size)) {}
+
+ private:
+  std::shared_ptr<Allocator> alloc_;
+  Allocation allocation_;
+};
+
+TEST(benchmark, allocator) {
+  std::shared_ptr<RawAllocator> raw_allocator(new HostAllocatorSample);
+  std::shared_ptr<Allocator> fancy_allocator(new FancyAllocator);
+  const size_t cycles = 100;
+  Timer timer;
+  double t1{}, t2{};
+  for (size_t i = 0; i < cycles; ++i) {
+    timer.tic();
+    for (size_t i = 0; i < cycles; ++i) {
+      StorageRawAlloc(raw_allocator, i * 100);
+    }
+    t1 += timer.toc();
+    timer.tic();
+    for (size_t i = 0; i < cycles; ++i) {
+      StorageFancyAlloc(fancy_allocator, i * 100);
+    }
+    t2 += timer.toc();
+  }
+  std::cout << "The cost of raw alloc is " << t1 << "ms.\n";
+  std::cout << "The cost of fancy alloc with place is " << t2 << "ms.\n";
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index e0ae600819..caacecf446 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -226,7 +226,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
 HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true`
 if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n"
-    check_approval 2 6888866 39303645
+    check_approval 1 6888866 39303645
 fi
 
 HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true`
@@ -241,23 +241,6 @@ if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 22561442 22334008
 fi
 
-ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true`
-if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    ERROR_LINES=""
-    for TEST_FILE in ${ALLOCSHARED_FILE_CHANGED};
-    do
-        HAS_SKIP_CHECK_ALLOC_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "AllocShared" || true`
-        if [ "${HAS_SKIP_CHECK_ALLOC_CI}" != "" ]; then
-            ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_ALLOC_CI}\n"
-        fi
-    done
-    if [ "${ERROR_LINES}" != "" ]; then
-        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
-        echo_line="memory::AllocShared is not recommended, because it is being modularized and refactored. Please use memory::Alloc here. Otherwise, please request zhiqiu and Shixiaowei02 review and approve.\n"
-        check_approval 2 6888866 39303645
-    fi
-fi
-
 ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
 if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
-- Gitee
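
For readers who want to exercise the new interface outside the patch, the sketch below shows how a host-side allocator could implement pten::Allocator from paddle/pten/core/candidate/allocator.h. It is illustrative only and not part of the commit; ToyHostAllocator and FreeHostBuffer are hypothetical names, and it assumes AllocationPtr is std::unique_ptr<Allocation, DeleterType> with DeleterType = std::function<void(Allocation*)>, as reconstructed in the header above.

// Illustrative sketch only; not part of the patch above.
#include <iostream>

#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/candidate/allocator.h"

namespace {

// Frees the underlying buffer; invoked from pten::Allocation's destructor.
void FreeHostBuffer(pten::Allocation* allocation) {
  ::operator delete(allocation->ptr());
}

// Hypothetical host allocator implementing the pten::Allocator interface.
class ToyHostAllocator : public pten::Allocator {
 public:
  AllocationPtr Allocate(size_t bytes_size) override {
    void* data = ::operator new(bytes_size);
    // The unique_ptr deleter destroys the Allocation object; the Allocation
    // destructor then calls FreeHostBuffer to release the buffer.
    return AllocationPtr(
        new pten::Allocation(data, bytes_size, &FreeHostBuffer,
                             paddle::platform::CPUPlace()),
        [](pten::Allocation* allocation) { delete allocation; });
  }
};

}  // namespace

int main() {
  ToyHostAllocator allocator;
  auto allocation = allocator.Allocate(256);
  std::cout << "allocated " << allocation->size() << " bytes at "
            << allocation->ptr() << std::endl;
  return 0;  // Allocation and its buffer are released when `allocation` goes out of scope.
}

Splitting responsibilities this way keeps the buffer-release logic inside Allocation (through its DeleterFnPtr), while the smart pointer's deleter only controls the lifetime of the Allocation wrapper itself.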